diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 1033 |
1 files changed, 448 insertions, 585 deletions
@@ -59,7 +59,6 @@ #define _(str) gettext (str) #include <wchar.h> -#include <wctype.h> #include "xalloc.h" @@ -69,6 +68,8 @@ #include "dfa.h" +#include "localeinfo.h" + #ifdef GAWK static int is_blank (int c) @@ -77,14 +78,6 @@ is_blank (int c) } #endif /* GAWK */ -#ifdef LIBC_IS_BORKED -extern int gawk_mb_cur_max; -#undef MB_CUR_MAX -#define MB_CUR_MAX gawk_mb_cur_max -#undef mbrtowc -#define mbrtowc(a, b, c, d) (-1) -#endif - /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit @@ -337,9 +330,6 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool curr_dependent; /* True if the follows of any positions with - ANYCHAR depends on the next character's - context. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -372,6 +362,10 @@ struct regex_syntax /* Flag for case-folding letters into sets. */ bool case_fold; + /* True if ^ and $ match only the start and end of data, and do not match + end-of-line within data. */ + bool anchor; + /* End-of-line byte in data. */ unsigned char eolbyte; @@ -395,8 +389,8 @@ struct regex_syntax meaning of the @#%!@#%^!@ syntax bits. */ struct lexer_state { - char const *lexptr; /* Pointer to next input character. */ - size_t lexleft; /* Number of characters remaining. */ + char const *ptr; /* Pointer to next input character. */ + size_t left; /* Number of characters remaining. */ token lasttok; /* Previous token returned; initially END. */ size_t parens; /* Count of outstanding left parens. */ int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -435,12 +429,13 @@ struct dfa charclass *charclasses; /* Array of character sets for CSET tokens. */ size_t cindex; /* Index for adding new charclasses. */ size_t calloc; /* Number of charclasses allocated. */ + size_t canychar; /* Index of anychar class, or (size_t) -1. */ /* Scanner state */ - struct lexer_state lexstate; + struct lexer_state lex; /* Parser state */ - struct parser_state parsestate; + struct parser_state parse; /* Fields filled by the parser. */ token *tokens; /* Postfix parse array. */ @@ -453,14 +448,9 @@ struct dfa size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ - bool multibyte; /* MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ - /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, - bool, size_t *, bool *); - /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. @@ -546,6 +536,21 @@ struct dfa state_num **mb_trans; /* Transition tables for states with ANYCHAR. */ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + + /* Information derived from the locale. This is at the end so that + a quick memset need not clear it specially. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); + + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, e.g., the relationship between lower- + and upper-case letters is 1-1. */ + bool simple_locale; + + /* Other cached information derived from the locale. */ + struct localeinfo localeinfo; }; /* Some macros for user access to dfa internals. */ @@ -559,13 +564,8 @@ struct dfa static void regexp (struct dfa *dfa); -/* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; - /* Store into *PWC the result of converting the leading bytes of the - multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + multibyte buffer S of length N bytes, using D->localeinfo.sbctowc and updating the conversion state in *D. On conversion error, convert just a single byte, to WEOF. Return the number of bytes converted. @@ -574,7 +574,7 @@ static wint_t mbrtowc_cache[NOTCHAR]; * PWC points to wint_t, not to wchar_t. * The last arg is a dfa *D instead of merely a multibyte conversion - state D->mbs. D also contains an mbrtowc_cache for speed. + state D->mbs. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. @@ -585,7 +585,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = mbrtowc_cache[uc]; + wint_t wc = d->localeinfo.sbctowc[uc]; if (wc == WEOF) { @@ -762,7 +762,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) /* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -dfa_charclass_index (struct dfa *d, charclass const s) +charclass_index (struct dfa *d, charclass const s) { size_t i; @@ -777,78 +777,21 @@ dfa_charclass_index (struct dfa *d, charclass const s) } static bool -unibyte_word_constituent (unsigned char c) +unibyte_word_constituent (struct dfa const *dfa, unsigned char c) { - return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); + return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); } static int char_context (struct dfa const *dfa, unsigned char c) { - if (c == dfa->syntax.eolbyte) + if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) return CTX_NEWLINE; - if (unibyte_word_constituent (c)) + if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; return CTX_NONE; } -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -static bool using_utf8; - -bool -dfa_using_utf8 (void) -{ - return using_utf8; -} - -static void -init_mbrtowc_cache (void) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} - -/* Entry point to set syntax options. */ -void -dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) -{ - int i; - dfa->syntax.syntax_bits_set = true; - dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; - - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - unsigned char uc = i; - - /* Use mbrtowc_cache to calculate sbit. */ - dfa->syntax.sbit[uc] = char_context (dfa, uc); - switch (dfa->syntax.sbit[uc]) - { - case CTX_LETTER: - setbit (uc, dfa->syntax.letters); - break; - case CTX_NEWLINE: - setbit (uc, dfa->syntax.newline); - break; - } - - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); - } -} - /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, this may happen when folding case in weird Turkish locales where @@ -877,30 +820,10 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } -static void check_utf8 (void) -{ - wchar_t wc; - mbstate_t mbs = { 0 }; - using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; -} - -static bool unibyte_c; - -static void check_unibyte_c (void) -{ - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); -} - -/* The current locale is known to be a unibyte locale - without multicharacter collating sequences and where range - comparisons simply use the native encoding. These locales can be - processed more efficiently. */ +/* Return true if the locale compatible with the C locale. */ static bool -using_simple_locale (struct dfa const *dfa) +using_simple_locale (bool multibyte) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -918,7 +841,15 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (!native_c_charset || dfa->multibyte) ? false : unibyte_c; + if (native_c_charset && !multibyte) + return true; + else + { + /* Treat C and POSIX locales as being compatible. Also, treat + errors as compatible, as these are invariably from stubs. */ + char const *loc = setlocale (LC_ALL, NULL); + return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX"); + } } /* Fetch the next lexical input character. Set C (of type int) to the @@ -930,23 +861,23 @@ using_simple_locale (struct dfa const *dfa) otherwise. */ # define FETCH_WC(dfa, c, wc, eoferr) \ do { \ - if (! dfa->lexstate.lexleft) \ + if (! (dfa)->lex.left) \ { \ if ((eoferr) != 0) \ dfaerror (eoferr); \ else \ - return dfa->lexstate.lasttok = END; \ + return (dfa)->lex.lasttok = END; \ } \ else \ { \ wint_t _wc; \ - size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \ - dfa->lexstate.lexleft, dfa); \ - dfa->lexstate.cur_mb_len = nbytes; \ + size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \ + (dfa)->lex.left, dfa); \ + (dfa)->lex.cur_mb_len = nbytes; \ (wc) = _wc; \ - (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \ - dfa->lexstate.lexptr += nbytes; \ - dfa->lexstate.lexleft -= nbytes; \ + (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \ + (dfa)->lex.ptr += nbytes; \ + (dfa)->lex.left -= nbytes; \ } \ } while (false) @@ -954,53 +885,6 @@ using_simple_locale (struct dfa const *dfa) # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE, - }; - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER. */ -enum -{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; - -/* Find the characters equal to C after case-folding, other than C - itself, and store them into FOLDED. Return the number of characters - stored. */ -static unsigned int -case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) -{ - unsigned int i; - unsigned int n = 0; - wint_t uc = towupper (c); - wint_t lc = towlower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && towupper (lc) == uc) - folded[n++] = lc; - for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) - { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && towupper (li) == uc) - folded[n++] = li; - } - return n; -} - typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1069,7 +953,7 @@ parse_bracket_exp (struct dfa *dfa) size_t chars_al; chars_al = 0; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, &dfa->mbcsets_alloc, @@ -1092,7 +976,7 @@ parse_bracket_exp (struct dfa *dfa) { FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (dfa); + known_bracket_exp = dfa->simple_locale; } else invert = false; @@ -1120,8 +1004,8 @@ parse_bracket_exp (struct dfa *dfa) for (;;) { FETCH_WC (dfa, c, wc, _("unbalanced [")); - if ((c == c1 && *dfa->lexstate.lexptr == ']') - || dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0 + || (c == c1 && dfa->lex.ptr[0] == ']')) break; if (len < MAX_BRACKET_STRING_LEN) str[len++] = c; @@ -1141,13 +1025,13 @@ parse_bracket_exp (struct dfa *dfa) { char const *class = (dfa->syntax.case_fold && (STREQ (str, "upper") - || STREQ (str, "lower")) ? - "alpha" : str); + || STREQ (str, "lower")) + ? "alpha" : str); const struct dfa_ctype *pred = find_pred (class); if (!pred) dfaerror (_("invalid character class")); - if (dfa->multibyte && !pred->single_byte_only) + if (dfa->localeinfo.multibyte && !pred->single_byte_only) known_bracket_exp = false; else for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1182,7 +1066,7 @@ parse_bracket_exp (struct dfa *dfa) /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and remember that the set is unknown. */ - if (c2 == '[' && *dfa->lexstate.lexptr == '.') + if (c2 == '[' && dfa->lex.ptr[0] == '.') { known_bracket_exp = false; c2 = ']'; @@ -1192,8 +1076,8 @@ parse_bracket_exp (struct dfa *dfa) { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ - dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len; - dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len; + dfa->lex.ptr -= dfa->lex.cur_mb_len; + dfa->lex.left += dfa->lex.cur_mb_len; } else { @@ -1207,9 +1091,9 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) known_bracket_exp = false; - else if (using_simple_locale (dfa)) + else if (dfa->simple_locale) { int ci; for (ci = c; ci <= c2; ci++) @@ -1236,7 +1120,7 @@ parse_bracket_exp (struct dfa *dfa) colon_warning_state |= (c == ':') ? 2 : 4; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { if (dfa->syntax.case_fold) setbit_case_fold_c (c, ccl); @@ -1273,37 +1157,45 @@ parse_bracket_exp (struct dfa *dfa) if (! known_bracket_exp) return BACKREF; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); + work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl); return MBCSET; } if (invert) { - assert (!dfa->multibyte); + assert (!dfa->localeinfo.multibyte); notset (ccl); if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit ('\n', ccl); } - return CSET + dfa_charclass_index (dfa, ccl); + return CSET + charclass_index (dfa, ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = dfa->lexstate.lexptr; \ - size_t lexleft_saved = dfa->lexstate.lexleft; \ - dfa->lexstate.lexptr = (s); \ - dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr) +struct lexptr +{ + char const *ptr; + size_t left; +}; -#define POP_LEX_STATE() \ - dfa->lexstate.lexptr = lexptr_saved; \ - dfa->lexstate.lexleft = lexleft_saved; \ - } \ - while (false) +static void +push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) +{ + ls->ptr = dfa->lex.ptr; + ls->left = dfa->lex.left; + dfa->lex.ptr = s; + dfa->lex.left = strlen (s); +} + +static void +pop_lex_state (struct dfa *dfa, struct lexptr const *ls) +{ + dfa->lex.ptr = ls->ptr; + dfa->lex.left = ls->left; +} static token lex (struct dfa *dfa) @@ -1321,14 +1213,14 @@ lex (struct dfa *dfa) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL); + FETCH_WC (dfa, c, dfa->lex.wctok, NULL); switch (c) { case '\\': if (backslash) goto normal_char; - if (dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0) dfaerror (_("unfinished \\ escape")); backslash = true; break; @@ -1337,28 +1229,29 @@ lex (struct dfa *dfa) if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN - || dfa->lexstate.lasttok == OR) - return dfa->lexstate.lasttok = BEGLINE; + || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN + || dfa->lex.lasttok == OR) + return dfa->lex.lasttok = BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lexleft == 0 - || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == ')') - || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == '|') + || dfa->lex.left == 0 + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) + & (dfa->lex.ptr[0] == '\\')] + == ')')) + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR) + & (dfa->lex.ptr[0] == '\\')] + == '|')) || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT) - && dfa->lexstate.lexleft > 0 - && *dfa->lexstate.lexptr == '\n')) - return dfa->lexstate.lasttok = ENDLINE; + && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n')) + return dfa->lex.lasttok = ENDLINE; goto normal_char; case '1': @@ -1372,8 +1265,8 @@ lex (struct dfa *dfa) case '9': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS)) { - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = BACKREF; + dfa->lex.laststart = false; + return dfa->lex.lasttok = BACKREF; } goto normal_char; @@ -1381,7 +1274,7 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be beginning of string */ - return dfa->lexstate.lasttok = BEGLINE; + return dfa->lex.lasttok = BEGLINE; } goto normal_char; @@ -1389,28 +1282,28 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be end of string */ - return dfa->lexstate.lasttok = ENDLINE; + return dfa->lex.lasttok = ENDLINE; } goto normal_char; case '<': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = BEGWORD; + return dfa->lex.lasttok = BEGWORD; goto normal_char; case '>': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = ENDWORD; + return dfa->lex.lasttok = ENDWORD; goto normal_char; case 'b': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = LIMWORD; + return dfa->lex.lasttok = LIMWORD; goto normal_char; case 'B': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = NOTLIMWORD; + return dfa->lex.lasttok = NOTLIMWORD; goto normal_char; case '?': @@ -1419,17 +1312,17 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = QMARK; + return dfa->lex.lasttok = QMARK; case '*': if (backslash) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = STAR; + return dfa->lex.lasttok = STAR; case '+': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) @@ -1437,9 +1330,9 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = PLUS; + return dfa->lex.lasttok = PLUS; case '{': if (!(dfa->syntax.syntax_bits & RE_INTERVALS)) @@ -1447,7 +1340,7 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; /* Cases: @@ -1457,111 +1350,107 @@ lex (struct dfa *dfa) {,} - 0 to infinity (same as '*') {M,N} - M through N */ { - char const *p = dfa->lexstate.lexptr; - char const *lim = p + dfa->lexstate.lexleft; - dfa->lexstate.minrep = dfa->lexstate.maxrep = -1; + char const *p = dfa->lex.ptr; + char const *lim = p + dfa->lex.left; + dfa->lex.minrep = dfa->lex.maxrep = -1; for (; p != lim && ISASCIIDIGIT (*p); p++) - { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = *p - '0'; - else - dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.minrep - * 10 + *p - '0')); - } + dfa->lex.minrep = (dfa->lex.minrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.minrep * 10 + *p - '0')); if (p != lim) { if (*p != ',') - dfa->lexstate.maxrep = dfa->lexstate.minrep; + dfa->lex.maxrep = dfa->lex.minrep; else { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = 0; + if (dfa->lex.minrep < 0) + dfa->lex.minrep = 0; while (++p != lim && ISASCIIDIGIT (*p)) - { - if (dfa->lexstate.maxrep < 0) - dfa->lexstate.maxrep = *p - '0'; - else - dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.maxrep - * 10 + *p - '0')); - } + dfa->lex.maxrep + = (dfa->lex.maxrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.maxrep * 10 + *p - '0')); } } if (! ((! backslash || (p != lim && *p++ == '\\')) && p != lim && *p++ == '}' - && 0 <= dfa->lexstate.minrep - && (dfa->lexstate.maxrep < 0 - || dfa->lexstate.minrep <= dfa->lexstate.maxrep))) + && 0 <= dfa->lex.minrep + && (dfa->lex.maxrep < 0 + || dfa->lex.minrep <= dfa->lex.maxrep))) { if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD) goto normal_char; dfaerror (_("invalid content of \\{\\}")); } - if (RE_DUP_MAX < dfa->lexstate.maxrep) + if (RE_DUP_MAX < dfa->lex.maxrep) dfaerror (_("regular expression too big")); - dfa->lexstate.lexptr = p; - dfa->lexstate.lexleft = lim - p; + dfa->lex.ptr = p; + dfa->lex.left = lim - p; } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = REPMN; + dfa->lex.laststart = false; + return dfa->lex.lasttok = REPMN; case '|': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '\n': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '(': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - ++dfa->lexstate.parens; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = LPAREN; + dfa->lex.parens++; + dfa->lex.laststart = true; + return dfa->lex.lasttok = LPAREN; case ')': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - if (dfa->lexstate.parens == 0 + if (dfa->lex.parens == 0 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; - --dfa->lexstate.parens; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = RPAREN; + dfa->lex.parens--; + dfa->lex.laststart = false; + return dfa->lex.lasttok = RPAREN; case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (dfa->canychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = ANYCHAR; + zeroset (ccl); + notset (ccl); + if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->localeinfo.multibyte) + for (c2 = 0; c2 < NOTCHAR; c2++) + if (dfa->localeinfo.sbctowc[c2] == WEOF) + clrbit (c2, ccl); + dfa->canychar = charclass_index (dfa, ccl); } - zeroset (ccl); - notset (ccl); - if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = (dfa->localeinfo.multibyte + ? ANYCHAR + : CSET + dfa->canychar); case 's': case 'S': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1569,9 +1458,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'S') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1580,31 +1468,31 @@ lex (struct dfa *dfa) /* \s and \S are documented to be equivalent to [[:space:]] and [^[:space:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case 'w': case 'W': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (dfa->syntax.sbit[c2] == CTX_LETTER) setbit (c2, ccl); if (c == 'W') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1613,38 +1501,38 @@ lex (struct dfa *dfa) /* \w and \W are documented to be equivalent to [_[:alnum:]] and [^_[:alnum:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case '[': if (backslash) goto normal_char; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = parse_bracket_exp (dfa); + dfa->lex.laststart = false; + return dfa->lex.lasttok = parse_bracket_exp (dfa); default: normal_char: - dfa->lexstate.laststart = false; + dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (dfa->multibyte) - return dfa->lexstate.lasttok = WCHAR; + if (dfa->localeinfo.multibyte) + return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } - return dfa->lexstate.lasttok = c; + return dfa->lex.lasttok = c; } } @@ -1661,11 +1549,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) { dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, sizeof *dfa->tokens); - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, sizeof *dfa->multibyte_prop); } - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; @@ -1678,21 +1566,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) case CAT: case OR: - --dfa->parsestate.depth; + dfa->parse.depth--; break; case BACKREF: dfa->fast = false; /* fallthrough */ default: - ++dfa->nleaves; + dfa->nleaves++; /* fallthrough */ case EMPTY: - ++dfa->parsestate.depth; + dfa->parse.depth++; break; } - if (dfa->parsestate.depth > dfa->depth) - dfa->depth = dfa->parsestate.depth; + if (dfa->parse.depth > dfa->depth) + dfa->depth = dfa->parse.depth; } static void addtok_wc (struct dfa *dfa, wint_t wc); @@ -1702,7 +1590,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc); static void addtok (struct dfa *dfa, token t) { - if (dfa->multibyte && t == MBCSET) + if (dfa->localeinfo.multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1749,19 +1637,19 @@ addtok_wc (struct dfa *dfa, wint_t wc) size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) - dfa->lexstate.cur_mb_len = stored_bytes; + dfa->lex.cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ - dfa->lexstate.cur_mb_len = 1; + dfa->lex.cur_mb_len = 1; buf[0] = 0; } - addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1); - for (i = 1; i < dfa->lexstate.cur_mb_len; i++) + addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1); + for (i = 1; i < dfa->lex.cur_mb_len; i++) { - addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0); + addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0); addtok (dfa, CAT); } } @@ -1801,7 +1689,7 @@ add_utf8_anychar (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); + dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1862,18 +1750,18 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parsestate.tok == WCHAR) + if (dfa->parse.tok == WCHAR) { - if (dfa->lexstate.wctok == WEOF) + if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); else { - addtok_wc (dfa, dfa->lexstate.wctok); + addtok_wc (dfa, dfa->lex.wctok); if (dfa->syntax.case_fold) { wchar_t folded[CASE_FOLDED_BUFSIZE]; - unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok, + unsigned int i, n = case_folded_counterparts (dfa->lex.wctok, folded); for (i = 0; i < n; i++) { @@ -1883,9 +1771,9 @@ atom (struct dfa *dfa) } } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1895,26 +1783,25 @@ atom (struct dfa *dfa) UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (dfa); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR) - || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF - || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE - || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR - || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD - || dfa->parsestate.tok == LIMWORD - || dfa->parsestate.tok == NOTLIMWORD) + else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR + || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == LPAREN) + else if (dfa->parse.tok == LPAREN) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); regexp (dfa); - if (dfa->parsestate.tok != RPAREN) + if (dfa->parse.tok != RPAREN) dfaerror (_("unbalanced (")); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } else addtok (dfa, EMPTY); @@ -1947,7 +1834,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -1962,40 +1849,39 @@ closure (struct dfa *dfa) size_t tindex, ntokens; atom (dfa); - while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR - || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN) - if (dfa->parsestate.tok == REPMN - && (dfa->lexstate.minrep || dfa->lexstate.maxrep)) + while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR + || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) + if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) { ntokens = nsubtoks (dfa, dfa->tindex); tindex = dfa->tindex - ntokens; - if (dfa->lexstate.maxrep < 0) + if (dfa->lex.maxrep < 0) addtok (dfa, PLUS); - if (dfa->lexstate.minrep == 0) + if (dfa->lex.minrep == 0) addtok (dfa, QMARK); - for (i = 1; i < dfa->lexstate.minrep; ++i) + for (i = 1; i < dfa->lex.minrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, CAT); } - for (; i < dfa->lexstate.maxrep; ++i) + for (; i < dfa->lex.maxrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, QMARK); addtok (dfa, CAT); } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == REPMN) + else if (dfa->parse.tok == REPMN) { dfa->tindex -= nsubtoks (dfa, dfa->tindex); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); closure (dfa); } else { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } } @@ -2003,8 +1889,8 @@ static void branch (struct dfa* dfa) { closure (dfa); - while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR - && dfa->parsestate.tok >= 0) + while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR + && dfa->parse.tok >= 0) { closure (dfa); addtok (dfa, CAT); @@ -2015,9 +1901,9 @@ static void regexp (struct dfa *dfa) { branch (dfa); - while (dfa->parsestate.tok == OR) + while (dfa->parse.tok == OR) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); branch (dfa); addtok (dfa, OR); } @@ -2029,26 +1915,26 @@ regexp (struct dfa *dfa) static void dfaparse (char const *s, size_t len, struct dfa *d) { - d->lexstate.lexptr = s; - d->lexstate.lexleft = len; - d->lexstate.lasttok = END; - d->lexstate.laststart = true; - d->lexstate.parens = 0; - if (d->multibyte) + d->lex.ptr = s; + d->lex.left = len; + d->lex.lasttok = END; + d->lex.laststart = true; + d->lex.parens = 0; + if (d->localeinfo.multibyte) { - d->lexstate.cur_mb_len = 0; + d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); } if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); - d->parsestate.tok = lex (d); - d->parsestate.depth = d->depth; + d->parse.tok = lex (d); + d->parse.depth = d->depth; regexp (d); - if (d->parsestate.tok != END) + if (d->parse.tok != END) dfaerror (_("unbalanced )")); addtok (d, END - d->nregexps); @@ -2169,7 +2055,6 @@ state_index (struct dfa *d, position_set const *s, int context) size_t hash = 0; int constraint = 0; state_num i, j; - bool curr_dependent = false; token first_end = 0; for (i = 0; i < s->nelem; ++i) @@ -2223,17 +2108,6 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) - { - int acceptable - = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) - ? CTX_NEWLINE : 0) - | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER) - ? CTX_LETTER : 0) - | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE) - ? CTX_NONE : 0)); - curr_dependent |= acceptable && (context & ~acceptable); - } } @@ -2244,7 +2118,6 @@ state_index (struct dfa *d, position_set const *s, int context) alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].curr_dependent = curr_dependent; d->states[i].constraint = constraint; d->states[i].first_end = first_end; d->states[i].mbps.nelem = 0; @@ -2331,11 +2204,10 @@ charclass_context (struct dfa const *dfa, charclass c) int context = 0; unsigned int j; - if (tstbit (dfa->syntax.eolbyte, c)) - context |= CTX_NEWLINE; - for (j = 0; j < CHARCLASS_WORDS; ++j) { + if (c[j] & dfa->syntax.newline[j]) + context |= CTX_NEWLINE; if (c[j] & dfa->syntax.letters[j]) context |= CTX_LETTER; if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j])) @@ -2700,25 +2572,22 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->tokens[pos.index] == ANYCHAR) { - /* ANYCHAR must match a single character, so put it to - D->states[s].mbps which contains the positions which can - match with a single character not a byte. If all - positions with ANYCHAR do not depend on the context of - the next character, put its follows instead to + copyset (d->charclasses[d->canychar], matches); + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to D->states[s].mbps to optimize. */ - if (d->states[s].curr_dependent) - { - if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); - insert (pos, &d->states[s].mbps); - } - else if (SUCCEEDS_IN_CONTEXT (pos.constraint, - d->states[s].context, CTX_ANY)) + if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, + CTX_NONE)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); for (j = 0; j < d->follows[pos.index].nelem; j++) insert (d->follows[pos.index].elems[j], &d->states[s].mbps); } @@ -2832,22 +2701,27 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) is to fail miserably. */ if (d->searchflag) { - /* Find the state(s) corresponding to the positions of state 0. */ - copy (&d->states[0].elems, &follows); - separate_contexts = state_separate_contexts (&follows); - state = state_index (d, &follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &follows, CTX_NEWLINE); - else - state_newline = state; - if (separate_contexts & CTX_LETTER) - state_letter = state_index (d, &follows, CTX_LETTER); - else - state_letter = state; + int c; + + state_newline = 0; + state_letter = d->min_trcount - 1; + state = d->initstate_notbol; - for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; - trans[d->syntax.eolbyte] = state_newline; + for (c = 0; c < NOTCHAR; ++c) + { + switch (d->syntax.sbit[c]) + { + case CTX_NEWLINE: + trans[c] = state_newline; + break; + case CTX_LETTER: + trans[c] = state_letter; + break; + default: + trans[c] = state; + break; + } + } } else for (i = 0; i < NOTCHAR; ++i) @@ -2863,7 +2737,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) insert (d->follows[grps[i].elems[j]].elems[k], &follows); - if (d->multibyte) + if (d->localeinfo.multibyte) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes @@ -2896,7 +2770,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ - if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) + if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte)) { merge (&d->states[0].elems, &follows, &tmp); copy (&tmp, &follows); @@ -2950,12 +2824,18 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) { int c = j * CHARCLASS_WORD_BITS + k; - if (c == d->syntax.eolbyte) - trans[c] = state_newline; - else if (unibyte_word_constituent (c)) - trans[c] = state_letter; - else if (c < NOTCHAR) - trans[c] = state; + switch (d->syntax.sbit[c]) + { + case CTX_NEWLINE: + trans[c] = state_newline; + break; + case CTX_LETTER: + trans[c] = state_letter; + break; + default: + trans[c] = state; + break; + } } } @@ -2993,7 +2873,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); - if (d->multibyte) + if (d->localeinfo.multibyte) { realtrans = d->mb_trans ? d->mb_trans - 1 : NULL; realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); @@ -3005,7 +2885,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) { d->trans[oldalloc] = NULL; d->fails[oldalloc] = NULL; - if (d->multibyte) + if (d->localeinfo.multibyte) d->mb_trans[oldalloc] = NULL; } } @@ -3039,7 +2919,7 @@ build_state (state_num s, struct dfa *d) } d->trcount = d->min_trcount; - if (d->multibyte) + if (d->localeinfo.multibyte) { for (i = d->min_trcount; i < d->tralloc; i++) { @@ -3096,16 +2976,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == d->syntax.eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) @@ -3132,15 +3002,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline, mb_index; - size_t i, j; + size_t i; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3152,31 +3019,9 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, s = transit_state_singlebyte (d, s, pp); *pp += mbclen - i; - if (d->states[s1].curr_dependent) + if (wc == WEOF) { - if (s < 0) - d->mb_follows.nelem = 0; - else - copy (&d->states[s].elems, &d->mb_follows); - - for (i = 0; i < d->states[s1].mbps.nelem; i++) - { - if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) - continue; - for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; - j++) - insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j], - &d->mb_follows); - } - - separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - realloc_trans_if_necessary (d, s); - + /* It is an invalid character, so ANYCHAR is not accepted. */ return s; } @@ -3187,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, { if (MAX_TRCOUNT <= d->mb_trcount) { - state_num s2; - for (s2 = -1; s2 < d->tralloc; s2++) + state_num s3; + for (s3 = -1; s3 < d->tralloc; s3++) { - free (d->mb_trans[s2]); - d->mb_trans[s2] = NULL; + free (d->mb_trans[s3]); + d->mb_trans[s3] = NULL; } for (i = 0; i < d->sindex; i++) @@ -3201,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - mb_index = d->states[s1].mb_trindex * 2; - if (! d->mb_trans[s]) { enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] }; - enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE }; + enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE }; d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else - { - state = d->mb_trans[s][mb_index + context_newline]; - if (0 <= state) - return state; - } + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; if (s < 0) copy (&d->states[s1].mbps, &d->mb_follows); @@ -3224,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index] = state; - d->mb_trans[s][mb_index + 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3254,16 +3088,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, Both P and MBP must be no larger than END. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, - unsigned char const *mbp, char const *end, wint_t *wcp) + unsigned char const *mbp, char const *end) { - wint_t wc = WEOF; + wint_t wc; if (d->syntax.never_trail[*p]) return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); - if (wcp != NULL) - *wcp = wc; return mbp; } @@ -3299,6 +3131,39 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, unsigned char saved_end; size_t nlcount = 0; + if (MAX_TRCOUNT <= d->sindex) + { + for (s = d->min_trcount; s < d->sindex; s++) + { + free (d->states[s].elems.elems); + free (d->states[s].mbps.elems); + } + d->sindex = d->min_trcount; + + if (d->trans) + { + for (s = 0; s < d->tralloc; s++) + { + free (d->trans[s]); + free (d->fails[s]); + d->trans[s] = d->fails[s] = NULL; + } + d->trcount = 0; + } + + if (d->localeinfo.multibyte && d->mb_trans) + { + for (s = -1; s < d->tralloc; s++) + { + free (d->mb_trans[s]); + d->mb_trans[s] = NULL; + } + for (s = 0; s < d->min_trcount; s++) + d->states[s].mb_trindex = -1; + d->mb_trcount = 0; + } + } + if (!d->tralloc) { realloc_trans_if_necessary (d, 1); @@ -3320,51 +3185,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, for (;;) { - if (multibyte) + while ((t = trans[s]) != NULL) { - while ((t = trans[s]) != NULL) + if (s < d->min_trcount) { - s1 = s; - - if (s < d->min_trcount) + if (!multibyte || d->states[s].mbps.nelem == 0) { - if (d->min_trcount == 1) - { - if (d->states[s].mbps.nelem == 0) - { - do - { - while (t[*p] == 0) - p++; - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - while (t[*p] == 0); - } - else - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - else - { - wint_t wc; - mbp = skip_remains_mb (d, p, mbp, end, &wc); - - /* If d->min_trcount is greater than 1, maybe - transit to another initial state after skip. */ - if (p < mbp) - { - /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE - cannot happen, as we assume that a newline - is always a single byte character. */ - s1 = s = d->initstate_notbol; - p = mbp; - } - } + while (t[*p] == s) + p++; } + if (multibyte) + p = mbp = skip_remains_mb (d, p, mbp, end); + } + + if (multibyte) + { + s1 = s; - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (d->states[s].mbps.nelem == 0 + || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3373,28 +3212,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } } - } - else - { - if (s == 0) - { - t = trans[s]; - if (t) - { - while (t[*p] == 0) - p++; - s1 = 0; - s = t[*p++]; - } - } - - while ((t = trans[s]) != NULL) + else { s1 = t[*p++]; t = trans[s1]; @@ -3405,6 +3227,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = tmp; /* swap */ break; } + if (s < d->min_trcount) + { + while (t[*p] == s1) + p++; + } s = t[*p++]; } } @@ -3422,19 +3249,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, nlcount++; mbp = p; - s = allow_nl ? d->newlines[s1] : 0; + s = (allow_nl ? d->newlines[s1] + : d->syntax.sbit[eol] == CTX_NEWLINE ? 0 + : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1 + : d->initstate_notbol); } else if (d->fails[s]) { - if (d->success[s] & d->syntax.sbit[*p]) + if ((d->success[s] & d->syntax.sbit[*p]) + || ((char *) p == end + && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, + *d))) goto done; + if (multibyte && s < d->min_trcount) + p = mbp = skip_remains_mb (d, p, mbp, end); + s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 - || (*p == eol && !allow_nl) - || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) { /* If a input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3443,8 +3276,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -3490,7 +3321,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, return (char *) begin; } -/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte), +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), but faster and set *BACKREF if the DFA code does not support this regexp usage. */ @@ -3548,7 +3379,7 @@ dfa_supported (struct dfa const *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (!d->multibyte) + if (!d->localeinfo.multibyte) continue; /* fallthrough */ @@ -3566,7 +3397,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8) + if (!d->localeinfo.using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -3596,7 +3427,7 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->multibyte = false; + d->localeinfo.multibyte = false; d->dfaexec = dfaexec_sb; d->fast = true; } @@ -3611,7 +3442,7 @@ dfassbuild (struct dfa *d) struct dfa *sup = dfaalloc (); *sup = *d; - sup->multibyte = false; + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; @@ -3644,7 +3475,7 @@ dfassbuild (struct dfa *d) case BACKREF: zeroset (ccl); notset (ccl); - sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = CSET + charclass_index (sup, ccl); sup->tokens[j++] = STAR; if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR || d->tokens[i + 1] == PLUS) @@ -3655,13 +3486,14 @@ dfassbuild (struct dfa *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (d->multibyte) + if (d->localeinfo.multibyte) { /* These constraints aren't supported in a multibyte locale. Ignore them in the superset DFA. */ sup->tokens[j++] = EMPTY; break; } + /* fallthrough */ default: sup->tokens[j++] = d->tokens[i]; if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR) @@ -3672,7 +3504,7 @@ dfassbuild (struct dfa *d) } sup->tindex = j; - if (have_nchar && (have_achar || d->multibyte)) + if (have_nchar && (have_achar || d->localeinfo.multibyte)) d->superset = sup; else { @@ -3714,7 +3546,7 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->multibyte) + if (d->localeinfo.multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) @@ -4238,20 +4070,51 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xcalloc (1, sizeof (struct dfa)); - d->multibyte = MB_CUR_MAX > 1; - d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; - d->fast = !d->multibyte; - d->lexstate.cur_mb_len = 1; - return d; + return xmalloc (sizeof (struct dfa)); } +/* Initialize DFA. */ void -dfa_init (void) +dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, + reg_syntax_t bits, int dfaopts) { - check_utf8 (); - check_unibyte_c (); - init_mbrtowc_cache (); + int i; + memset (dfa, 0, offsetof (struct dfa, dfaexec)); + dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; + dfa->simple_locale = using_simple_locale (linfo->multibyte); + dfa->localeinfo = *linfo; + + dfa->fast = !dfa->localeinfo.multibyte; + + dfa->canychar = -1; + dfa->lex.cur_mb_len = 1; + dfa->syntax.syntax_bits_set = true; + dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0; + dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; + dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; + dfa->syntax.syntax_bits = bits; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + dfa->syntax.sbit[uc] = char_context (dfa, uc); + switch (dfa->syntax.sbit[uc]) + { + case CTX_LETTER: + setbit (uc, dfa->syntax.letters); + break; + case CTX_NEWLINE: + setbit (uc, dfa->syntax.newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 + ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* vim:set shiftwidth=2: */ |