diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2014-04-11 07:44:22 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2014-04-11 07:44:22 +0300 |
commit | ebb6772e9eabeb81e3cc9305a6bec7adf7aad450 (patch) | |
tree | 2cf743f82791db19cc7e31cab86b1fc9a4d5ddbb /dfa.c | |
parent | e069c636968370f0899d5e4ebaeb9c2341804245 (diff) | |
parent | a4b59faf911743b30f2e6e979c4f9c1ea0669ac3 (diff) | |
download | egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.gz egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.bz2 egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.zip |
Merge branch 'master' into comment
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 537 |
1 files changed, 309 insertions, 228 deletions
@@ -43,8 +43,15 @@ #include "missing_d/gawkbool.h" #endif /* HAVE_STDBOOL_H */ -#include "dfa.h" - +/* Gawk doesn't use Gnulib, so don't assume that setlocale and + static_assert are present. */ +#ifndef LC_ALL +# define setlocale(category, locale) NULL +#endif +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif #define STREQ(a, b) (strcmp (a, b) == 0) @@ -84,6 +91,8 @@ #include "xalloc.h" +#include "dfa.h" + #ifdef GAWK static int is_blank (int c) @@ -130,7 +139,7 @@ typedef unsigned int charclass[CHARCLASS_INTS]; /* Convert a possibly-signed character to an unsigned character. This is a bit safer than casting to unsigned char, since it catches some type errors that the cast doesn't. */ -static inline unsigned char +static unsigned char to_uchar (char ch) { return ch; @@ -219,7 +228,8 @@ enum EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches the empty string. */ - BACKREF, /* BACKREF is generated by \<digit>; it + BACKREF, /* BACKREF is generated by \<digit> + or by any other construct that is not completely handled. If the scanner detects a transition on backref, it returns a kind of "semi-success" indicating that @@ -402,6 +412,14 @@ struct dfa size_t nmultibyte_prop; int *multibyte_prop; +#if MBS_SUPPORT + /* A table indexed by byte values that contains the corresponding wide + character (if any) for that byte. WEOF means the byte is the + leading byte of a multibyte character. Invalid and null bytes are + mapped to themselves. */ + wint_t mbrtowc_cache[NOTCHAR]; +#endif + /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; size_t nmbcsets; @@ -504,6 +522,64 @@ static void regexp (void); } \ while (false) +static void +dfambcache (struct dfa *d) +{ +#if MBS_SUPPORT + int i; + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t s = { 0 }; + wchar_t wc; + wint_t wi; + switch (mbrtowc (&wc, &c, 1, &s)) + { + default: wi = wc; break; + case (size_t) -2: wi = WEOF; break; + case (size_t) -1: wi = uc; break; + } + d->mbrtowc_cache[uc] = wi; + } +#endif +} + +#if MBS_SUPPORT +/* Given the dfa D, store into *PWC the result of converting the + leading bytes of the multibyte buffer S of length N bytes, updating + the conversion state in *MBS. On conversion error, convert just a + single byte as-is. Return the number of bytes converted. + + This differs from mbrtowc (PWC, S, N, MBS) as follows: + + * Extra arg D, containing an mbrtowc_cache for speed. + * N must be at least 1. + * S[N - 1] must be a sentinel byte. + * Shift encodings are not supported. + * The return value is always in the range 1..N. + * *MBS is always valid afterwards. + * *PWC is always set to something. */ +static size_t +mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n, + mbstate_t *mbs) +{ + unsigned char uc = s[0]; + wint_t wc = d->mbrtowc_cache[uc]; + + if (wc == WEOF) + { + size_t nbytes = mbrtowc (pwc, s, n, mbs); + if (0 < nbytes && nbytes < (size_t) -2) + return nbytes; + memset (mbs, 0, sizeof *mbs); + wc = uc; + } + + *pwc = wc; + return 1; +} +#endif #ifdef DEBUG @@ -731,67 +807,39 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) this may happen when folding case in weird Turkish locales where dotless i/dotted I are not included in the chosen character set. Return whether a bit was set in the charclass. */ -#if MBS_SUPPORT static bool setbit_wc (wint_t wc, charclass c) { +#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -} - -/* Set a bit in the charclass for the given single byte character, - if it is valid in the current character set. */ -static void -setbit_c (int b, charclass c) -{ - /* Do nothing if b is invalid in this character set. */ - if (MB_CUR_MAX > 1 && btowc (b) == WEOF) - return; - setbit (b, c); -} #else -# define setbit_c setbit -static inline bool -setbit_wc (wint_t wc, charclass c) -{ abort (); /*NOTREACHED*/ return false; -} #endif +} -/* Like setbit_c, but if case is folded, set both cases of a letter. For - MB_CUR_MAX > 1, the resulting charset is only used as an optimization, - and the caller takes care of setting the appropriate field of struct - mb_char_classes. */ +/* Set a bit for B and its case variants in the charclass C. + MB_CUR_MAX must be 1. */ static void setbit_case_fold_c (int b, charclass c) { - if (MB_CUR_MAX > 1) - { - wint_t wc = btowc (b); - if (wc == WEOF) - return; - setbit (b, c); - if (case_fold && iswalpha (wc)) - setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); - } - else - { - setbit (b, c); - if (case_fold && isalpha (b)) - setbit_c (isupper (b) ? tolower (b) : toupper (b), c); - } + int ub = toupper (b); + int i; + for (i = 0; i < NOTCHAR; i++) + if (toupper (i) == ub) + setbit (i, c); } /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ -static inline int +int using_utf8 (void) { static int utf8 = -1; @@ -811,6 +859,46 @@ using_utf8 (void) return utf8; } +/* Return true if the current locale is known to be a unibyte locale + without multicharacter collating sequences and where range + comparisons simply use the native encoding. These locales can be + processed more efficiently. */ + +static bool +using_simple_locale (void) +{ + /* True if the native character set is known to be compatible with + the C locale. The following test isn't perfect, but it's good + enough in practice, as only ASCII and EBCDIC are in common use + and this test correctly accepts ASCII and rejects EBCDIC. */ + enum { native_c_charset = + ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 + && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 + && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 + && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 + && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 + && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 + && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 + && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 + && '}' == 125 && '~' == 126) + }; + + if (! native_c_charset || MB_CUR_MAX > 1) + return false; + else + { + static int unibyte_c = -1; + if (unibyte_c < 0) + { + char const *locale = setlocale (LC_ALL, NULL); + unibyte_c = (!locale + || STREQ (locale, "C") + || STREQ (locale, "POSIX")); + } + return unibyte_c; + } +} + /* Lexical analyzer. All the dross that deals with the obnoxious GNU Regex syntax bits is located here. The poor, suffering reader is referred to the GNU Regex documentation for the @@ -827,7 +915,7 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */ static int cur_mb_len = 1; /* Length of the multibyte representation of wctok. */ /* These variables are used only if (MB_CUR_MAX > 1). */ -static mbstate_t mbs; /* Mbstate for mbrlen. */ +static mbstate_t mbs; /* mbstate for mbrtowc. */ static wchar_t wctok; /* Wide character representation of the current multibyte character. */ static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec. @@ -864,32 +952,18 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ else \ { \ wchar_t _wc; \ - cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \ - if (cur_mb_len <= 0) \ - { \ - cur_mb_len = 1; \ - --lexleft; \ - (wc) = (c) = to_uchar (*lexptr++); \ - } \ - else \ - { \ - lexptr += cur_mb_len; \ - lexleft -= cur_mb_len; \ - (wc) = _wc; \ - (c) = wctob (wc); \ - } \ + size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \ + cur_mb_len = nbytes; \ + (wc) = _wc; \ + (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \ + lexptr += nbytes; \ + lexleft -= nbytes; \ } \ } while (0) -# define FETCH(c, eoferr) \ - do { \ - wint_t wc; \ - FETCH_WC (c, wc, eoferr); \ - } while (0) - #else /* Note that characters become unsigned here. */ -# define FETCH(c, eoferr) \ +# define FETCH_WC(c, unused, eoferr) \ do { \ if (! lexleft) \ { \ @@ -902,14 +976,56 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ --lexleft; \ } while (0) -# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr) - #endif /* MBS_SUPPORT */ #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2 + == CASE_FOLDED_BUFSIZE), + "CASE_FOLDED_BUFSIZE is wrong"); + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. */ +int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + int i; + int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} + typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -928,7 +1044,7 @@ static const struct dfa_ctype prednames[] = { {"upper", isupper, false}, {"lower", islower, false}, {"digit", isdigit, true}, - {"xdigit", isxdigit, true}, + {"xdigit", isxdigit, false}, {"space", isspace, false}, {"punct", ispunct, false}, {"alnum", isalnum, false}, @@ -959,6 +1075,10 @@ parse_bracket_exp (void) int c, c1, c2; charclass ccl; + /* True if this is a bracket expression that dfaexec is known to + process correctly. */ + bool known_bracket_exp = true; + /* Used to warn about [:space:]. Bit 0 = first character is a colon. Bit 1 = last character is a colon. @@ -1000,6 +1120,7 @@ parse_bracket_exp (void) { FETCH_WC (c, wc, _("unbalanced [")); invert = 1; + known_bracket_exp = using_simple_locale (); } else invert = 0; @@ -1014,16 +1135,14 @@ parse_bracket_exp (void) we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ - if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) + if (c == '[') { #define MAX_BRACKET_STRING_LEN 32 char str[MAX_BRACKET_STRING_LEN + 1]; FETCH_WC (c1, wc1, _("unbalanced [")); - /* If pattern contains '[[:', '[[.', or '[[='. */ - if (c1 == ':' - /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */ - || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))) + if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES)) + || c1 == '.' || c1 == '=') { size_t len = 0; for (;;) @@ -1042,7 +1161,10 @@ parse_bracket_exp (void) /* Fetch bracket. */ FETCH_WC (c, wc, _("unbalanced [")); if (c1 == ':') - /* build character class. */ + /* Build character class. POSIX allows character + classes to match multicharacter collating elements, + but the regex code does not support that, so do not + worry about that possibility. */ { char const *class = (case_fold && (STREQ (str, "upper") @@ -1064,30 +1186,11 @@ parse_bracket_exp (void) for (c2 = 0; c2 < NOTCHAR; ++c2) if (pred->func (c2)) - setbit_case_fold_c (c2, ccl); + setbit (c2, ccl); } + else + known_bracket_exp = false; - else if (MBS_SUPPORT && (c1 == '=' || c1 == '.')) - { - char *elem = xmemdup (str, len + 1); - - if (c1 == '=') - /* build equivalence class. */ - { - REALLOC_IF_NECESSARY (work_mbc->equivs, - equivs_al, work_mbc->nequivs + 1); - work_mbc->equivs[work_mbc->nequivs++] = elem; - } - - if (c1 == '.') - /* build collating element. */ - { - REALLOC_IF_NECESSARY (work_mbc->coll_elems, - coll_elems_al, - work_mbc->ncoll_elems + 1); - work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; - } - } colon_warning_state |= 8; /* Fetch new lookahead character. */ @@ -1109,105 +1212,96 @@ parse_bracket_exp (void) /* build range characters. */ { FETCH_WC (c2, wc2, _("unbalanced [")); - if (c2 == ']') + + /* A bracket expression like [a-[.aa.]] matches an unknown set. + Treat it like [-a[.aa.]] while parsing it, and + remember that the set is unknown. */ + if (c2 == '[' && *lexptr == '.') { - /* In the case [x-], the - is an ordinary hyphen, - which is left in c1, the lookahead character. */ - lexptr -= cur_mb_len; - lexleft += cur_mb_len; + known_bracket_exp = false; + c2 = ']'; } - } - if (c1 == '-' && c2 != ']') - { - if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH_WC (c2, wc2, _("unbalanced [")); - - if (MB_CUR_MAX > 1) + if (c2 != ']') { - /* When case folding map a range, say [m-z] (or even [M-z]) - to the pair of ranges, [m-z] [M-Z]. */ - REALLOC_IF_NECESSARY (work_mbc->range_sts, - range_sts_al, work_mbc->nranges + 1); - REALLOC_IF_NECESSARY (work_mbc->range_ends, - range_ends_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = - case_fold ? towlower (wc) : (wchar_t) wc; - work_mbc->range_ends[work_mbc->nranges++] = - case_fold ? towlower (wc2) : (wchar_t) wc2; - - if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH_WC (c2, wc2, _("unbalanced [")); + + if (MB_CUR_MAX > 1) { + /* When case folding map a range, say [m-z] (or even [M-z]) + to the pair of ranges, [m-z] [M-Z]. Although this code + is wrong in multiple ways, it's never used in practice. + FIXME: Remove this (and related) unused code. */ REALLOC_IF_NECESSARY (work_mbc->range_sts, range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = towupper (wc); REALLOC_IF_NECESSARY (work_mbc->range_ends, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); - } - } - else - { -#ifdef GAWK - c1 = c; - if (case_fold) - { - c1 = tolower (c1); - c2 = tolower (c2); + work_mbc->range_sts[work_mbc->nranges] = + case_fold ? towlower (wc) : (wchar_t) wc; + work_mbc->range_ends[work_mbc->nranges++] = + case_fold ? towlower (wc2) : (wchar_t) wc2; + + if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + { + REALLOC_IF_NECESSARY (work_mbc->range_sts, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = towupper (wc); + REALLOC_IF_NECESSARY (work_mbc->range_ends, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + } } - for (c = c1; c <= c2; c++) - setbit_case_fold_c (c, ccl); -#else - /* Defer to the system regex library about the meaning - of range expressions. */ - regex_t re; - char pattern[6] = { '[', 0, '-', 0, ']', 0 }; - char subject[2] = { 0, 0 }; - c1 = c; - if (case_fold) + else if (using_simple_locale ()) { - c1 = tolower (c1); - c2 = tolower (c2); + for (c1 = c; c1 <= c2; c1++) + setbit (c1, ccl); + if (case_fold) + { + int uc = toupper (c); + int uc2 = toupper (c2); + for (c1 = 0; c1 < NOTCHAR; c1++) + { + int uc1 = toupper (c1); + if (uc <= uc1 && uc1 <= uc2) + setbit (c1, ccl); + } + } } + else + known_bracket_exp = false; - pattern[1] = c1; - pattern[3] = c2; - regcomp (&re, pattern, REG_NOSUB); - for (c = 0; c < NOTCHAR; ++c) - { - if ((case_fold && isupper (c))) - continue; - subject[0] = c; - if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH) - setbit_case_fold_c (c, ccl); - } - regfree (&re); -#endif + colon_warning_state |= 8; + FETCH_WC (c1, wc1, _("unbalanced [")); + continue; } - colon_warning_state |= 8; - FETCH_WC (c1, wc1, _("unbalanced [")); - continue; + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + lexptr -= cur_mb_len; + lexleft += cur_mb_len; } colon_warning_state |= (c == ':') ? 2 : 4; if (MB_CUR_MAX == 1) { - setbit_case_fold_c (c, ccl); + if (case_fold) + setbit_case_fold_c (c, ccl); + else + setbit (c, ccl); continue; } - if (case_fold && iswalpha (wc)) + if (case_fold) { - wc = towlower (wc); - if (!setbit_wc (wc, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; - } - wc = towupper (wc); + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wc, folded); + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + n); + for (i = 0; i < n; i++) + if (!setbit_wc (folded[i], ccl)) + work_mbc->chars[work_mbc->nchars++] = folded[i]; } if (!setbit_wc (wc, ccl)) { @@ -1221,6 +1315,9 @@ parse_bracket_exp (void) if (colon_warning_state == 7) dfawarn (_("character class syntax is [[:space:]], not [:space:]")); + if (! known_bracket_exp) + return BACKREF; + if (MB_CUR_MAX > 1) { static charclass zeroclass; @@ -1256,14 +1353,9 @@ lex (void) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - if (MB_CUR_MAX > 1) - { - FETCH_WC (c, wctok, NULL); - if ((int) c == EOF) - goto normal_char; - } - else - FETCH (c, NULL); + FETCH_WC (c, wctok, NULL); + if (c == (unsigned int) EOF) + goto normal_char; switch (c) { @@ -1638,10 +1730,11 @@ addtok (token t) work_mbc->nchars = 0; } - /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */ + /* If the MBCSET is non-inverted and doesn't include neither + character classes including multibyte characters, range + expressions, equivalence classes nor collating elements, + it can be replaced to a simple CSET. */ if (work_mbc->invert - || (!using_utf8 () && work_mbc->cset != -1) - || work_mbc->nchars != 0 || work_mbc->nch_classes != 0 || work_mbc->nranges != 0 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0) @@ -1656,7 +1749,6 @@ addtok (token t) that the mbcset is empty now. Do nothing in that case. */ if (work_mbc->cset != -1) { - assert (using_utf8 ()); addtok (CSET + work_mbc->cset); if (need_or) addtok (OR); @@ -1680,16 +1772,19 @@ static void addtok_wc (wint_t wc) { unsigned char buf[MB_LEN_MAX]; - mbstate_t s; + mbstate_t s = { 0 }; int i; - memset (&s, 0, sizeof s); - cur_mb_len = wcrtomb ((char *) buf, wc, &s); + size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); - /* This is merely stop-gap. When cur_mb_len is 0 or negative, - buf[0] is undefined, yet skipping the addtok_mb call altogether - can result in heap corruption. */ - if (cur_mb_len <= 0) - buf[0] = 0; + if (stored_bytes != (size_t) -1) + cur_mb_len = stored_bytes; + else + { + /* This is merely stop-gap. buf[0] is undefined, yet skipping + the addtok_mb call altogether can corrupt the heap. */ + cur_mb_len = 1; + buf[0] = 0; + } addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1); for (i = 1; i < cur_mb_len; i++) @@ -1794,17 +1889,19 @@ add_utf8_anychar (void) static void atom (void) { - if (0) - { - /* empty */ - } - else if (MBS_SUPPORT && tok == WCHAR) + if (MBS_SUPPORT && tok == WCHAR) { - addtok_wc (case_fold ? towlower (wctok) : wctok); - if (case_fold && iswalpha (wctok)) + addtok_wc (wctok); + + if (case_fold) { - addtok_wc (towupper (wctok)); - addtok (OR); + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wctok, folded); + for (i = 0; i < n; i++) + { + addtok_wc (folded[i]); + addtok (OR); + } } tok = lex (); @@ -3308,43 +3405,26 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) /* Initialize mblen_buf and inputwcs with data from the next line. */ static void -prepare_wc_buf (const char *begin, const char *end) +prepare_wc_buf (struct dfa *d, const char *begin, const char *end) { #if MBS_SUPPORT unsigned char eol = eolbyte; - size_t remain_bytes, i; + size_t i; + size_t ilim = end - begin + 1; buf_begin = (unsigned char *) begin; - remain_bytes = 0; - for (i = 0; i < end - begin + 1; i++) + for (i = 0; i < ilim; i++) { - if (remain_bytes == 0) - { - remain_bytes - = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs); - if (remain_bytes < 1 - || remain_bytes == (size_t) -1 - || remain_bytes == (size_t) -2 - || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i])) - { - remain_bytes = 0; - inputwcs[i] = (wchar_t) begin[i]; - mblen_buf[i] = 0; - if (begin[i] == eol) - break; - } - else - { - mblen_buf[i] = remain_bytes; - remain_bytes--; - } - } - else + size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs); + mblen_buf[i] = nbytes - (nbytes == 1); + if (begin[i] == eol) + break; + while (--nbytes != 0) { - mblen_buf[i] = remain_bytes; + i++; + mblen_buf[i] = nbytes; inputwcs[i] = 0; - remain_bytes--; } } @@ -3391,7 +3471,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, MALLOC (mblen_buf, end - begin + 2); MALLOC (inputwcs, end - begin + 2); memset (&mbs, 0, sizeof (mbstate_t)); - prepare_wc_buf ((const char *) p, end); + prepare_wc_buf (d, (const char *) p, end); } for (;;) @@ -3481,7 +3561,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, ++*count; if (d->mb_cur_max > 1) - prepare_wc_buf ((const char *) p, end); + prepare_wc_buf (d, (const char *) p, end); } /* Check if we've run off the end of the buffer. */ @@ -3600,6 +3680,7 @@ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { dfainit (d); + dfambcache (d); dfaparse (s, len, d); dfamust (d); dfaoptimize (d); |