diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 1474 |
1 files changed, 644 insertions, 830 deletions
@@ -59,7 +59,6 @@ #define _(str) gettext (str) #include <wchar.h> -#include <wctype.h> #include "xalloc.h" @@ -69,6 +68,8 @@ #include "dfa.h" +#include "localeinfo.h" + #ifdef GAWK static int is_blank (int c) @@ -77,14 +78,6 @@ is_blank (int c) } #endif /* GAWK */ -#ifdef LIBC_IS_BORKED -extern int gawk_mb_cur_max; -#undef MB_CUR_MAX -#define MB_CUR_MAX gawk_mb_cur_max -#undef mbrtowc -#define mbrtowc(a, b, c, d) (-1) -#endif - /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit @@ -337,9 +330,6 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool curr_dependent; /* True if the follows of any positions with - ANYCHAR depends on the next character's - context. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -350,7 +340,8 @@ typedef struct ANYCHAR. */ } dfa_state; -/* Maximum for any transition table count that exceeds min_trcount. */ +/* Maximum for any transition table count. This should be at least 3, + for the initial state setup. */ enum { MAX_TRCOUNT = 1024 }; /* A bracket operator. @@ -372,6 +363,10 @@ struct regex_syntax /* Flag for case-folding letters into sets. */ bool case_fold; + /* True if ^ and $ match only the start and end of data, and do not match + end-of-line within data. */ + bool anchor; + /* End-of-line byte in data. */ unsigned char eolbyte; @@ -395,8 +390,8 @@ struct regex_syntax meaning of the @#%!@#%^!@ syntax bits. */ struct lexer_state { - char const *lexptr; /* Pointer to next input character. */ - size_t lexleft; /* Number of characters remaining. */ + char const *ptr; /* Pointer to next input character. */ + size_t left; /* Number of characters remaining. */ token lasttok; /* Previous token returned; initially END. */ size_t parens; /* Count of outstanding left parens. */ int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -435,12 +430,13 @@ struct dfa charclass *charclasses; /* Array of character sets for CSET tokens. */ size_t cindex; /* Index for adding new charclasses. */ size_t calloc; /* Number of charclasses allocated. */ + size_t canychar; /* Index of anychar class, or (size_t) -1. */ /* Scanner state */ - struct lexer_state lexstate; + struct lexer_state lex; /* Parser state */ - struct parser_state parsestate; + struct parser_state parse; /* Fields filled by the parser. */ token *tokens; /* Postfix parse array. */ @@ -453,14 +449,9 @@ struct dfa size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ - bool multibyte; /* MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ - /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, - bool, size_t *, bool *); - /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. @@ -513,22 +504,26 @@ struct dfa /* Fields filled by dfaexec. */ state_num tralloc; /* Number of transition tables that have - slots so far, not counting trans[-1]. */ + slots so far, not counting trans[-1] and + trans[-2]. */ int trcount; /* Number of transition tables that have - actually been built. */ - int min_trcount; /* Minimum of number of transition tables. - Always keep the number, even after freeing - the transition tables. It is also the - number of initial states. */ + been built, other than for initial + states. */ + int min_trcount; /* Number of initial states. Equivalently, + the minimum state number for which trcount + counts transitions. */ state_num **trans; /* Transition tables for states that can never accept. If the transitions for a state have not yet been computed, or the state could possibly accept, its entry in this table is NULL. This points to one past the start of the allocated array, - and trans[-1] is always NULL. */ + and trans[-1] and trans[-2] are always + NULL. */ state_num **fails; /* Transition tables after failing to accept - on a state that potentially could do so. */ + on a state that potentially could do so. + If trans[i] is non-null, fails[i] must + be null. */ int *success; /* Table of acceptance conditions used in dfaexec and computed in build_state. */ state_num *newlines; /* Transitions on newlines. The entry for a @@ -543,9 +538,25 @@ struct dfa do not distinguish between their contexts, as not supported word. */ position_set mb_follows; /* Follow set added by ANYCHAR on demand. */ - state_num **mb_trans; /* Transition tables for states with ANYCHAR. */ + state_num **mb_trans; /* Transition tables for states with + ANYCHAR. */ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + + /* Information derived from the locale. This is at the end so that + a quick memset need not clear it specially. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); + + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, e.g., the relationship between lower- + and upper-case letters is 1-1. */ + bool simple_locale; + + /* Other cached information derived from the locale. */ + struct localeinfo localeinfo; }; /* Some macros for user access to dfa internals. */ @@ -559,13 +570,8 @@ struct dfa static void regexp (struct dfa *dfa); -/* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; - /* Store into *PWC the result of converting the leading bytes of the - multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + multibyte buffer S of length N bytes, using D->localeinfo.sbctowc and updating the conversion state in *D. On conversion error, convert just a single byte, to WEOF. Return the number of bytes converted. @@ -574,7 +580,7 @@ static wint_t mbrtowc_cache[NOTCHAR]; * PWC points to wint_t, not to wchar_t. * The last arg is a dfa *D instead of merely a multibyte conversion - state D->mbs. D also contains an mbrtowc_cache for speed. + state D->mbs. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. @@ -585,7 +591,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = mbrtowc_cache[uc]; + wint_t wc = d->localeinfo.sbctowc[uc]; if (wc == WEOF) { @@ -715,10 +721,17 @@ zeroset (charclass s) } static void -notset (charclass s) +fillset (charclass s) { int i; + for (i = 0; i < CHARCLASS_WORDS; i++) + s[i] = CHARCLASS_WORD_MASK; +} +static void +notset (charclass s) +{ + int i; for (i = 0; i < CHARCLASS_WORDS; ++i) s[i] = CHARCLASS_WORD_MASK & ~s[i]; } @@ -762,7 +775,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) /* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -dfa_charclass_index (struct dfa *d, charclass const s) +charclass_index (struct dfa *d, charclass const s) { size_t i; @@ -777,76 +790,36 @@ dfa_charclass_index (struct dfa *d, charclass const s) } static bool -unibyte_word_constituent (unsigned char c) +unibyte_word_constituent (struct dfa const *dfa, unsigned char c) { - return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); + return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); } static int char_context (struct dfa const *dfa, unsigned char c) { - if (c == dfa->syntax.eolbyte) + if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) return CTX_NEWLINE; - if (unibyte_word_constituent (c)) + if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; return CTX_NONE; } -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -static bool using_utf8; - -bool -dfa_using_utf8 (void) -{ - return using_utf8; -} - -static void -init_mbrtowc_cache (void) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} - -/* Entry point to set syntax options. */ +/* Copy the syntax settings from one dfa instance to another. + Saves considerable computation time if compiling many regular expressions + based on the same setting. */ void -dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) +dfacopysyntax (struct dfa *to, const struct dfa *from) { - int i; - dfa->syntax.syntax_bits_set = true; - dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; + to->dfaexec = from->dfaexec; + to->simple_locale = from->simple_locale; + to->localeinfo = from->localeinfo; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - unsigned char uc = i; - - /* Use mbrtowc_cache to calculate sbit. */ - dfa->syntax.sbit[uc] = char_context (dfa, uc); - switch (dfa->syntax.sbit[uc]) - { - case CTX_LETTER: - setbit (uc, dfa->syntax.letters); - break; - case CTX_NEWLINE: - setbit (uc, dfa->syntax.newline); - break; - } + to->fast = from->fast; - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); - } + to->canychar = from->canychar; + to->lex.cur_mb_len = from->lex.cur_mb_len; + to->syntax = from->syntax; } /* Set a bit in the charclass for the given wchar_t. Do nothing if WC @@ -877,30 +850,10 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } -static void check_utf8 (void) -{ - wchar_t wc; - mbstate_t mbs = { 0 }; - using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; -} - -static bool unibyte_c; - -static void check_unibyte_c (void) -{ - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); -} - -/* The current locale is known to be a unibyte locale - without multicharacter collating sequences and where range - comparisons simply use the native encoding. These locales can be - processed more efficiently. */ +/* Return true if the locale compatible with the C locale. */ static bool -using_simple_locale (struct dfa const *dfa) +using_simple_locale (bool multibyte) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -918,7 +871,15 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (!native_c_charset || dfa->multibyte) ? false : unibyte_c; + if (!native_c_charset || multibyte) + return false; + else + { + /* Treat C and POSIX locales as being compatible. Also, treat + errors as compatible, as these are invariably from stubs. */ + char const *loc = setlocale (LC_ALL, NULL); + return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX"); + } } /* Fetch the next lexical input character. Set C (of type int) to the @@ -930,23 +891,23 @@ using_simple_locale (struct dfa const *dfa) otherwise. */ # define FETCH_WC(dfa, c, wc, eoferr) \ do { \ - if (! dfa->lexstate.lexleft) \ + if (! (dfa)->lex.left) \ { \ if ((eoferr) != 0) \ dfaerror (eoferr); \ else \ - return dfa->lexstate.lasttok = END; \ + return (dfa)->lex.lasttok = END; \ } \ else \ { \ wint_t _wc; \ - size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \ - dfa->lexstate.lexleft, dfa); \ - dfa->lexstate.cur_mb_len = nbytes; \ + size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \ + (dfa)->lex.left, dfa); \ + (dfa)->lex.cur_mb_len = nbytes; \ (wc) = _wc; \ - (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \ - dfa->lexstate.lexptr += nbytes; \ - dfa->lexstate.lexleft -= nbytes; \ + (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \ + (dfa)->lex.ptr += nbytes; \ + (dfa)->lex.left -= nbytes; \ } \ } while (false) @@ -954,53 +915,6 @@ using_simple_locale (struct dfa const *dfa) # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE, - }; - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER. */ -enum -{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; - -/* Find the characters equal to C after case-folding, other than C - itself, and store them into FOLDED. Return the number of characters - stored. */ -static unsigned int -case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) -{ - unsigned int i; - unsigned int n = 0; - wint_t uc = towupper (c); - wint_t lc = towlower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && towupper (lc) == uc) - folded[n++] = lc; - for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) - { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && towupper (li) == uc) - folded[n++] = li; - } - return n; -} - typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1069,7 +983,7 @@ parse_bracket_exp (struct dfa *dfa) size_t chars_al; chars_al = 0; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, &dfa->mbcsets_alloc, @@ -1092,7 +1006,7 @@ parse_bracket_exp (struct dfa *dfa) { FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (dfa); + known_bracket_exp = dfa->simple_locale; } else invert = false; @@ -1120,8 +1034,8 @@ parse_bracket_exp (struct dfa *dfa) for (;;) { FETCH_WC (dfa, c, wc, _("unbalanced [")); - if ((c == c1 && *dfa->lexstate.lexptr == ']') - || dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0 + || (c == c1 && dfa->lex.ptr[0] == ']')) break; if (len < MAX_BRACKET_STRING_LEN) str[len++] = c; @@ -1141,13 +1055,13 @@ parse_bracket_exp (struct dfa *dfa) { char const *class = (dfa->syntax.case_fold && (STREQ (str, "upper") - || STREQ (str, "lower")) ? - "alpha" : str); + || STREQ (str, "lower")) + ? "alpha" : str); const struct dfa_ctype *pred = find_pred (class); if (!pred) dfaerror (_("invalid character class")); - if (dfa->multibyte && !pred->single_byte_only) + if (dfa->localeinfo.multibyte && !pred->single_byte_only) known_bracket_exp = false; else for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1182,7 +1096,7 @@ parse_bracket_exp (struct dfa *dfa) /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and remember that the set is unknown. */ - if (c2 == '[' && *dfa->lexstate.lexptr == '.') + if (c2 == '[' && dfa->lex.ptr[0] == '.') { known_bracket_exp = false; c2 = ']'; @@ -1192,8 +1106,8 @@ parse_bracket_exp (struct dfa *dfa) { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ - dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len; - dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len; + dfa->lex.ptr -= dfa->lex.cur_mb_len; + dfa->lex.left += dfa->lex.cur_mb_len; } else { @@ -1207,9 +1121,9 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) known_bracket_exp = false; - else if (using_simple_locale (dfa)) + else if (dfa->simple_locale) { int ci; for (ci = c; ci <= c2; ci++) @@ -1236,7 +1150,7 @@ parse_bracket_exp (struct dfa *dfa) colon_warning_state |= (c == ':') ? 2 : 4; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { if (dfa->syntax.case_fold) setbit_case_fold_c (c, ccl); @@ -1273,37 +1187,45 @@ parse_bracket_exp (struct dfa *dfa) if (! known_bracket_exp) return BACKREF; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); + work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl); return MBCSET; } if (invert) { - assert (!dfa->multibyte); + assert (!dfa->localeinfo.multibyte); notset (ccl); if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit ('\n', ccl); } - return CSET + dfa_charclass_index (dfa, ccl); + return CSET + charclass_index (dfa, ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = dfa->lexstate.lexptr; \ - size_t lexleft_saved = dfa->lexstate.lexleft; \ - dfa->lexstate.lexptr = (s); \ - dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr) +struct lexptr +{ + char const *ptr; + size_t left; +}; + +static void +push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) +{ + ls->ptr = dfa->lex.ptr; + ls->left = dfa->lex.left; + dfa->lex.ptr = s; + dfa->lex.left = strlen (s); +} -#define POP_LEX_STATE() \ - dfa->lexstate.lexptr = lexptr_saved; \ - dfa->lexstate.lexleft = lexleft_saved; \ - } \ - while (false) +static void +pop_lex_state (struct dfa *dfa, struct lexptr const *ls) +{ + dfa->lex.ptr = ls->ptr; + dfa->lex.left = ls->left; +} static token lex (struct dfa *dfa) @@ -1321,14 +1243,14 @@ lex (struct dfa *dfa) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL); + FETCH_WC (dfa, c, dfa->lex.wctok, NULL); switch (c) { case '\\': if (backslash) goto normal_char; - if (dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0) dfaerror (_("unfinished \\ escape")); backslash = true; break; @@ -1337,28 +1259,29 @@ lex (struct dfa *dfa) if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN - || dfa->lexstate.lasttok == OR) - return dfa->lexstate.lasttok = BEGLINE; + || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN + || dfa->lex.lasttok == OR) + return dfa->lex.lasttok = BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lexleft == 0 - || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == ')') - || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == '|') + || dfa->lex.left == 0 + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) + & (dfa->lex.ptr[0] == '\\')] + == ')')) + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR) + & (dfa->lex.ptr[0] == '\\')] + == '|')) || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT) - && dfa->lexstate.lexleft > 0 - && *dfa->lexstate.lexptr == '\n')) - return dfa->lexstate.lasttok = ENDLINE; + && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n')) + return dfa->lex.lasttok = ENDLINE; goto normal_char; case '1': @@ -1372,8 +1295,8 @@ lex (struct dfa *dfa) case '9': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS)) { - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = BACKREF; + dfa->lex.laststart = false; + return dfa->lex.lasttok = BACKREF; } goto normal_char; @@ -1381,7 +1304,7 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be beginning of string */ - return dfa->lexstate.lasttok = BEGLINE; + return dfa->lex.lasttok = BEGLINE; } goto normal_char; @@ -1389,28 +1312,28 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be end of string */ - return dfa->lexstate.lasttok = ENDLINE; + return dfa->lex.lasttok = ENDLINE; } goto normal_char; case '<': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = BEGWORD; + return dfa->lex.lasttok = BEGWORD; goto normal_char; case '>': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = ENDWORD; + return dfa->lex.lasttok = ENDWORD; goto normal_char; case 'b': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = LIMWORD; + return dfa->lex.lasttok = LIMWORD; goto normal_char; case 'B': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = NOTLIMWORD; + return dfa->lex.lasttok = NOTLIMWORD; goto normal_char; case '?': @@ -1419,17 +1342,17 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = QMARK; + return dfa->lex.lasttok = QMARK; case '*': if (backslash) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = STAR; + return dfa->lex.lasttok = STAR; case '+': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) @@ -1437,9 +1360,9 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = PLUS; + return dfa->lex.lasttok = PLUS; case '{': if (!(dfa->syntax.syntax_bits & RE_INTERVALS)) @@ -1447,7 +1370,7 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; /* Cases: @@ -1457,111 +1380,106 @@ lex (struct dfa *dfa) {,} - 0 to infinity (same as '*') {M,N} - M through N */ { - char const *p = dfa->lexstate.lexptr; - char const *lim = p + dfa->lexstate.lexleft; - dfa->lexstate.minrep = dfa->lexstate.maxrep = -1; + char const *p = dfa->lex.ptr; + char const *lim = p + dfa->lex.left; + dfa->lex.minrep = dfa->lex.maxrep = -1; for (; p != lim && ISASCIIDIGIT (*p); p++) - { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = *p - '0'; - else - dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.minrep - * 10 + *p - '0')); - } + dfa->lex.minrep = (dfa->lex.minrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.minrep * 10 + *p - '0')); if (p != lim) { if (*p != ',') - dfa->lexstate.maxrep = dfa->lexstate.minrep; + dfa->lex.maxrep = dfa->lex.minrep; else { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = 0; + if (dfa->lex.minrep < 0) + dfa->lex.minrep = 0; while (++p != lim && ISASCIIDIGIT (*p)) - { - if (dfa->lexstate.maxrep < 0) - dfa->lexstate.maxrep = *p - '0'; - else - dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.maxrep - * 10 + *p - '0')); - } + dfa->lex.maxrep + = (dfa->lex.maxrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.maxrep * 10 + *p - '0')); } } if (! ((! backslash || (p != lim && *p++ == '\\')) && p != lim && *p++ == '}' - && 0 <= dfa->lexstate.minrep - && (dfa->lexstate.maxrep < 0 - || dfa->lexstate.minrep <= dfa->lexstate.maxrep))) + && 0 <= dfa->lex.minrep + && (dfa->lex.maxrep < 0 + || dfa->lex.minrep <= dfa->lex.maxrep))) { if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD) goto normal_char; dfaerror (_("invalid content of \\{\\}")); } - if (RE_DUP_MAX < dfa->lexstate.maxrep) + if (RE_DUP_MAX < dfa->lex.maxrep) dfaerror (_("regular expression too big")); - dfa->lexstate.lexptr = p; - dfa->lexstate.lexleft = lim - p; + dfa->lex.ptr = p; + dfa->lex.left = lim - p; } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = REPMN; + dfa->lex.laststart = false; + return dfa->lex.lasttok = REPMN; case '|': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '\n': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '(': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - ++dfa->lexstate.parens; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = LPAREN; + dfa->lex.parens++; + dfa->lex.laststart = true; + return dfa->lex.lasttok = LPAREN; case ')': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - if (dfa->lexstate.parens == 0 + if (dfa->lex.parens == 0 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; - --dfa->lexstate.parens; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = RPAREN; + dfa->lex.parens--; + dfa->lex.laststart = false; + return dfa->lex.lasttok = RPAREN; case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (dfa->canychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = ANYCHAR; + fillset (ccl); + if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->localeinfo.multibyte) + for (c2 = 0; c2 < NOTCHAR; c2++) + if (dfa->localeinfo.sbctowc[c2] == WEOF) + clrbit (c2, ccl); + dfa->canychar = charclass_index (dfa, ccl); } - zeroset (ccl); - notset (ccl); - if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = (dfa->localeinfo.multibyte + ? ANYCHAR + : CSET + dfa->canychar); case 's': case 'S': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1569,9 +1487,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'S') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1580,31 +1497,31 @@ lex (struct dfa *dfa) /* \s and \S are documented to be equivalent to [[:space:]] and [^[:space:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case 'w': case 'W': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (dfa->syntax.sbit[c2] == CTX_LETTER) setbit (c2, ccl); if (c == 'W') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1613,38 +1530,38 @@ lex (struct dfa *dfa) /* \w and \W are documented to be equivalent to [_[:alnum:]] and [^_[:alnum:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case '[': if (backslash) goto normal_char; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = parse_bracket_exp (dfa); + dfa->lex.laststart = false; + return dfa->lex.lasttok = parse_bracket_exp (dfa); default: normal_char: - dfa->lexstate.laststart = false; + dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (dfa->multibyte) - return dfa->lexstate.lasttok = WCHAR; + if (dfa->localeinfo.multibyte) + return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } - return dfa->lexstate.lasttok = c; + return dfa->lex.lasttok = c; } } @@ -1661,11 +1578,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) { dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, sizeof *dfa->tokens); - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, sizeof *dfa->multibyte_prop); } - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; @@ -1678,21 +1595,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) case CAT: case OR: - --dfa->parsestate.depth; + dfa->parse.depth--; break; case BACKREF: dfa->fast = false; /* fallthrough */ default: - ++dfa->nleaves; + dfa->nleaves++; /* fallthrough */ case EMPTY: - ++dfa->parsestate.depth; + dfa->parse.depth++; break; } - if (dfa->parsestate.depth > dfa->depth) - dfa->depth = dfa->parsestate.depth; + if (dfa->parse.depth > dfa->depth) + dfa->depth = dfa->parse.depth; } static void addtok_wc (struct dfa *dfa, wint_t wc); @@ -1702,7 +1619,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc); static void addtok (struct dfa *dfa, token t) { - if (dfa->multibyte && t == MBCSET) + if (dfa->localeinfo.multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1749,19 +1666,19 @@ addtok_wc (struct dfa *dfa, wint_t wc) size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) - dfa->lexstate.cur_mb_len = stored_bytes; + dfa->lex.cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ - dfa->lexstate.cur_mb_len = 1; + dfa->lex.cur_mb_len = 1; buf[0] = 0; } - addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1); - for (i = 1; i < dfa->lexstate.cur_mb_len; i++) + addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1); + for (i = 1; i < dfa->lex.cur_mb_len; i++) { - addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0); + addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0); addtok (dfa, CAT); } } @@ -1801,7 +1718,7 @@ add_utf8_anychar (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); + dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1862,18 +1779,18 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parsestate.tok == WCHAR) + if (dfa->parse.tok == WCHAR) { - if (dfa->lexstate.wctok == WEOF) + if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); else { - addtok_wc (dfa, dfa->lexstate.wctok); + addtok_wc (dfa, dfa->lex.wctok); if (dfa->syntax.case_fold) { wchar_t folded[CASE_FOLDED_BUFSIZE]; - unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok, + unsigned int i, n = case_folded_counterparts (dfa->lex.wctok, folded); for (i = 0; i < n; i++) { @@ -1883,9 +1800,9 @@ atom (struct dfa *dfa) } } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1895,26 +1812,25 @@ atom (struct dfa *dfa) UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (dfa); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR) - || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF - || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE - || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR - || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD - || dfa->parsestate.tok == LIMWORD - || dfa->parsestate.tok == NOTLIMWORD) + else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR + || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == LPAREN) + else if (dfa->parse.tok == LPAREN) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); regexp (dfa); - if (dfa->parsestate.tok != RPAREN) + if (dfa->parse.tok != RPAREN) dfaerror (_("unbalanced (")); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } else addtok (dfa, EMPTY); @@ -1947,7 +1863,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -1962,40 +1878,39 @@ closure (struct dfa *dfa) size_t tindex, ntokens; atom (dfa); - while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR - || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN) - if (dfa->parsestate.tok == REPMN - && (dfa->lexstate.minrep || dfa->lexstate.maxrep)) + while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR + || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) + if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) { ntokens = nsubtoks (dfa, dfa->tindex); tindex = dfa->tindex - ntokens; - if (dfa->lexstate.maxrep < 0) + if (dfa->lex.maxrep < 0) addtok (dfa, PLUS); - if (dfa->lexstate.minrep == 0) + if (dfa->lex.minrep == 0) addtok (dfa, QMARK); - for (i = 1; i < dfa->lexstate.minrep; ++i) + for (i = 1; i < dfa->lex.minrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, CAT); } - for (; i < dfa->lexstate.maxrep; ++i) + for (; i < dfa->lex.maxrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, QMARK); addtok (dfa, CAT); } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == REPMN) + else if (dfa->parse.tok == REPMN) { dfa->tindex -= nsubtoks (dfa, dfa->tindex); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); closure (dfa); } else { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } } @@ -2003,8 +1918,8 @@ static void branch (struct dfa* dfa) { closure (dfa); - while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR - && dfa->parsestate.tok >= 0) + while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR + && dfa->parse.tok >= 0) { closure (dfa); addtok (dfa, CAT); @@ -2015,9 +1930,9 @@ static void regexp (struct dfa *dfa) { branch (dfa); - while (dfa->parsestate.tok == OR) + while (dfa->parse.tok == OR) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); branch (dfa); addtok (dfa, OR); } @@ -2029,26 +1944,26 @@ regexp (struct dfa *dfa) static void dfaparse (char const *s, size_t len, struct dfa *d) { - d->lexstate.lexptr = s; - d->lexstate.lexleft = len; - d->lexstate.lasttok = END; - d->lexstate.laststart = true; - d->lexstate.parens = 0; - if (d->multibyte) + d->lex.ptr = s; + d->lex.left = len; + d->lex.lasttok = END; + d->lex.laststart = true; + d->lex.parens = 0; + if (d->localeinfo.multibyte) { - d->lexstate.cur_mb_len = 0; + d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); } if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); - d->parsestate.tok = lex (d); - d->parsestate.depth = d->depth; + d->parse.tok = lex (d); + d->parse.depth = d->depth; regexp (d); - if (d->parsestate.tok != END) + if (d->parse.tok != END) dfaerror (_("unbalanced )")); addtok (d, END - d->nregexps); @@ -2169,7 +2084,6 @@ state_index (struct dfa *d, position_set const *s, int context) size_t hash = 0; int constraint = 0; state_num i, j; - bool curr_dependent = false; token first_end = 0; for (i = 0; i < s->nelem; ++i) @@ -2223,17 +2137,6 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) - { - int acceptable - = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) - ? CTX_NEWLINE : 0) - | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER) - ? CTX_LETTER : 0) - | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE) - ? CTX_NONE : 0)); - curr_dependent |= acceptable && (context & ~acceptable); - } } @@ -2244,7 +2147,6 @@ state_index (struct dfa *d, position_set const *s, int context) alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].curr_dependent = curr_dependent; d->states[i].constraint = constraint; d->states[i].first_end = first_end; d->states[i].mbps.nelem = 0; @@ -2331,11 +2233,10 @@ charclass_context (struct dfa const *dfa, charclass c) int context = 0; unsigned int j; - if (tstbit (dfa->syntax.eolbyte, c)) - context |= CTX_NEWLINE; - for (j = 0; j < CHARCLASS_WORDS; ++j) { + if (c[j] & dfa->syntax.newline[j]) + context |= CTX_NEWLINE; if (c[j] & dfa->syntax.letters[j]) context |= CTX_LETTER; if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j])) @@ -2626,6 +2527,7 @@ dfaanalyze (struct dfa *d, bool searchflag) if (separate_contexts & CTX_LETTER) d->min_trcount = state_index (d, &merged, CTX_LETTER); d->min_trcount++; + d->trcount = 0; free (posalloc); free (stkalloc); @@ -2634,91 +2536,93 @@ dfaanalyze (struct dfa *d, bool searchflag) } -/* Find, for each character, the transition out of state s of d, and store - it in the appropriate slot of trans. +/* Return the transition out of state s of d for the input character uc, + updating the slots in trans accordingly. - We divide the positions of s into groups (positions can appear in more - than one group). Each group is labeled with a set of characters that + Do not worry about all possible input characters; calculate just the group + of positions that match uc. Label it with the set of characters that every position in the group matches (taking into account, if necessary, - preceding context information of s). For each group, find the union - of the its elements' follows. This set is the set of positions of the + preceding context information of s). Then find the union + of these positions' follows, i.e., the set of positions of the new state. For each character in the group's label, set the transition on this character to be to a state corresponding to the set's positions, and its associated backward context information, if necessary. - If we are building a searching matcher, we include the positions of state + When building a searching matcher, include the positions of state 0 in every state. - The collection of groups is constructed by building an equivalence-class + The group is constructed by building an equivalence-class partition of the positions of s. For each position, find the set of characters C that it matches. Eliminate any characters from C that fail on grounds of backward context. - Search through the groups, looking for a group whose label L has nonempty + Check whether the group's label L has nonempty intersection with C. If L - C is nonempty, create a new group labeled L - C and having the same positions as the current group, and set L to - the intersection of L and C. Insert the position in this group, set + the intersection of L and C. Insert the position in the group, set C = C - L, and resume scanning. If after comparing with every group there are characters remaining in C, create a new group labeled with the characters of C and insert this position in that group. */ -static void -dfastate (state_num s, struct dfa *d, state_num trans[]) +static state_num +dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[]) { - leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */ - charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ - size_t ngrps = 0; /* Number of groups actually used. */ - position pos; /* Current position being considered. */ - charclass matches; /* Set of matching characters. */ - charclass_word matchesf; /* Nonzero if matches is nonempty. */ - charclass intersect; /* Intersection with some label set. */ - charclass_word intersectf; /* Nonzero if intersect is nonempty. */ - charclass leftovers; /* Stuff in the label that didn't match. */ - charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */ - position_set follows; /* Union of the follows of some group. */ + leaf_set group; /* Positions that match the input char. */ + charclass label; /* The group's label. */ + position_set follows; /* Union of the follows of the group. */ position_set tmp; /* Temporary space for merging sets. */ - int possible_contexts; /* Contexts that this group can match. */ - int separate_contexts; /* Context that new state wants to know. */ state_num state; /* New state. */ state_num state_newline; /* New state on a newline transition. */ state_num state_letter; /* New state on a letter transition. */ - bool next_isnt_1st_byte = false; /* We can't add state0. */ size_t i, j, k; #ifdef DEBUG fprintf (stderr, "build state %td\n", s); #endif - zeroset (matches); + group.elems = xnmalloc (d->nleaves, sizeof *group.elems); + group.nelem = 0; + + fillset (label); for (i = 0; i < d->states[s].elems.nelem; ++i) { - pos = d->states[s].elems.elems[i]; + charclass matches; /* Set of matching characters. */ + position pos = d->states[s].elems.elems[i]; + bool matched = false; if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) - setbit (d->tokens[pos.index], matches); + { + zeroset (matches); + setbit (d->tokens[pos.index], matches); + if (d->tokens[pos.index] == uc) + matched = true; + } else if (d->tokens[pos.index] >= CSET) - copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) { - /* ANYCHAR must match a single character, so put it to - D->states[s].mbps which contains the positions which can - match with a single character not a byte. If all - positions with ANYCHAR do not depend on the context of - the next character, put its follows instead to + copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); + if (tstbit (uc, d->charclasses[d->tokens[pos.index] - CSET])) + matched = true; + } + else if (d->tokens[pos.index] == ANYCHAR) + { + copyset (d->charclasses[d->canychar], matches); + if (tstbit (uc, d->charclasses[d->canychar])) + matched = true; + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to D->states[s].mbps to optimize. */ - if (d->states[s].curr_dependent) - { - if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); - insert (pos, &d->states[s].mbps); - } - else if (SUCCEEDS_IN_CONTEXT (pos.constraint, - d->states[s].context, CTX_ANY)) + if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, + CTX_NONE)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); for (j = 0; j < d->follows[pos.index].nelem; j++) insert (d->follows[pos.index].elems[j], &d->states[s].mbps); } @@ -2748,6 +2652,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) continue; if (j == CHARCLASS_WORDS) continue; + + /* If we have reset the bit that made us declare "matched", reset + that indicator, too. This is required to avoid an infinite loop + with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */ + if (!tstbit (uc, matches)) + matched = false; } #ifdef DEBUG @@ -2760,150 +2670,69 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) fprintf (stderr, "\n"); #endif - for (j = 0; j < ngrps; ++j) + if (matched) { - /* If matches contains a single character only, and the current - group's label doesn't contain that character, go on to the - next group. */ - if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR - && !tstbit (d->tokens[pos.index], labels[j])) - continue; - - /* Check if this group's label has a nonempty intersection with - matches. */ - intersectf = 0; - for (k = 0; k < CHARCLASS_WORDS; ++k) - intersectf |= intersect[k] = matches[k] & labels[j][k]; - if (!intersectf) - continue; - - /* It does; now find the set differences both ways. */ - leftoversf = matchesf = 0; for (k = 0; k < CHARCLASS_WORDS; ++k) - { - /* Even an optimizing compiler can't know this for sure. */ - charclass_word match = matches[k], label = labels[j][k]; - - leftoversf |= leftovers[k] = label & ~match; - matchesf |= matches[k] = match & ~label; - } - - /* If there were leftovers, create a new group labeled with them. */ - if (leftoversf) - { - copyset (leftovers, labels[ngrps]); - copyset (intersect, labels[j]); - grps[ngrps].elems = xnmalloc (d->nleaves, - sizeof *grps[ngrps].elems); - memcpy (grps[ngrps].elems, grps[j].elems, - sizeof (grps[j].elems[0]) * grps[j].nelem); - grps[ngrps].nelem = grps[j].nelem; - ++ngrps; - } - - /* Put the position in the current group. The constraint is - irrelevant here. */ - grps[j].elems[grps[j].nelem++] = pos.index; - - /* If every character matching the current position has been - accounted for, we're done. */ - if (!matchesf) - break; + label[k] &= matches[k]; + group.elems[group.nelem++] = pos.index; } - - /* If we've passed the last group, and there are still characters - unaccounted for, then we'll have to create a new group. */ - if (j == ngrps) + else { - copyset (matches, labels[ngrps]); - zeroset (matches); - grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems); - grps[ngrps].nelem = 1; - grps[ngrps].elems[0] = pos.index; - ++ngrps; + for (k = 0; k < CHARCLASS_WORDS; ++k) + label[k] &= ~matches[k]; } } alloc_position_set (&follows, d->nleaves); alloc_position_set (&tmp, d->nleaves); - /* If we are a searching matcher, the default transition is to a state - containing the positions of state 0, otherwise the default transition - is to fail miserably. */ - if (d->searchflag) + if (group.nelem > 0) { - /* Find the state(s) corresponding to the positions of state 0. */ - copy (&d->states[0].elems, &follows); - separate_contexts = state_separate_contexts (&follows); - state = state_index (d, &follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &follows, CTX_NEWLINE); - else - state_newline = state; - if (separate_contexts & CTX_LETTER) - state_letter = state_index (d, &follows, CTX_LETTER); - else - state_letter = state; - - for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; - trans[d->syntax.eolbyte] = state_newline; - } - else - for (i = 0; i < NOTCHAR; ++i) - trans[i] = -1; + int possible_contexts; /* Contexts that the group can match. */ + int separate_contexts; /* Context that new state wants to know. */ - for (i = 0; i < ngrps; ++i) - { follows.nelem = 0; /* Find the union of the follows of the positions of the group. This is a hideously inefficient loop. Fix it someday. */ - for (j = 0; j < grps[i].nelem; ++j) - for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) - insert (d->follows[grps[i].elems[j]].elems[k], &follows); + for (j = 0; j < group.nelem; ++j) + for (k = 0; k < d->follows[group.elems[j]].nelem; ++k) + insert (d->follows[group.elems[j]].elems[k], &follows); - if (d->multibyte) + /* If we are building a searching matcher, throw in the positions + of state 0 as well, if possible. */ + if (d->searchflag) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes which are not 1st byte of the multibyte character. - Then, if a state of follows encounter a byte, it must not be - a 1st byte of a multibyte character nor single byte character. - We cansel to add state[0].follows to next state, because - state[0] must accept 1st-byte - - For example, we assume <sb a> is a certain single byte - character, <mb A> is a certain multibyte character, and the - codepoint of <sb a> equals the 2nd byte of the codepoint of - <mb A>. - When state[0] accepts <sb a>, state[i] transit to state[i+1] - by accepting accepts 1st byte of <mb A>, and state[i+1] - accepts 2nd byte of <mb A>, if state[i+1] encounter the - codepoint of <sb a>, it must not be <sb a> but 2nd byte of - <mb A>, so we cannot add state[0]. */ - - next_isnt_1st_byte = false; - for (j = 0; j < follows.nelem; ++j) + Then, if a state of follows encounters a byte, it must not be + a 1st byte of a multibyte character nor a single byte character. + In this case, do not add state[0].follows to next state, because + state[0] must accept 1st-byte. + + For example, suppose <sb a> is a certain single byte character, + <mb A> is a certain multibyte character, and the codepoint of + <sb a> equals the 2nd byte of the codepoint of <mb A>. When + state[0] accepts <sb a>, state[i] transits to state[i+1] by + accepting the 1st byte of <mb A>, and state[i+1] accepts the + 2nd byte of <mb A>, if state[i+1] encounters the codepoint of + <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do + not add state[0]. */ + + bool mergeit = !d->localeinfo.multibyte; + if (!mergeit) + for (mergeit = true, j = 0; mergeit && j < follows.nelem; j++) + mergeit &= d->multibyte_prop[follows.elems[j].index]; + if (mergeit) { - if (!(d->multibyte_prop[follows.elems[j].index] & 1)) - { - next_isnt_1st_byte = true; - break; - } + merge (&d->states[0].elems, &follows, &tmp); + copy (&tmp, &follows); } } - /* If we are building a searching matcher, throw in the positions - of state 0 as well. */ - if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) - { - merge (&d->states[0].elems, &follows, &tmp); - copy (&tmp, &follows); - } - /* Find out if the new state will want any context information. */ - possible_contexts = charclass_context (d, labels[i]); + possible_contexts = charclass_context (d, label); separate_contexts = state_separate_contexts (&follows); /* Find the state(s) corresponding to the union of the follows. */ @@ -2919,45 +2748,39 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state_index (d, &follows, CTX_LETTER); else state_letter = state; + } -#ifdef DEBUG - fprintf (stderr, "group %zu\n nextpos:", i); - for (j = 0; j < grps[i].nelem; ++j) - { - fprintf (stderr, " %zu:", grps[i].elems[j]); - prtok (d->tokens[grps[i].elems[j]]); - } - fprintf (stderr, "\n follows:"); - for (j = 0; j < follows.nelem; ++j) + /* If we are a searching matcher, the default transition is to a state + containing the positions of state 0, otherwise the default transition + is to fail miserably. */ + else if (d->searchflag) + { + state_newline = 0; + state_letter = d->min_trcount - 1; + state = d->initstate_notbol; + } + else + { + state_newline = -1; + state_letter = -1; + state = -1; + } + + /* Set the transitions for each character in the label. */ + for (i = 0; i < NOTCHAR; i++) + if (tstbit (i, label)) + switch (d->syntax.sbit[i]) { - fprintf (stderr, " %zu:", follows.elems[j].index); - prtok (d->tokens[follows.elems[j].index]); + case CTX_NEWLINE: + trans[i] = state_newline; + break; + case CTX_LETTER: + trans[i] = state_letter; + break; + default: + trans[i] = state; + break; } - fprintf (stderr, "\n states:"); - if (possible_contexts & CTX_NEWLINE) - fprintf (stderr, " CTX_NEWLINE:%td", state_newline); - if (possible_contexts & CTX_LETTER) - fprintf (stderr, " CTX_LETTER:%td", state_letter); - if (possible_contexts & CTX_NONE) - fprintf (stderr, " CTX_NONE:%td", state); - fprintf (stderr, "\n"); -#endif - - /* Set the transitions for each character in the current label. */ - for (j = 0; j < CHARCLASS_WORDS; ++j) - for (k = 0; k < CHARCLASS_WORD_BITS; ++k) - if (labels[i][j] >> k & 1) - { - int c = j * CHARCLASS_WORD_BITS + k; - - if (c == d->syntax.eolbyte) - trans[c] = state_newline; - else if (unibyte_word_constituent (c)) - trans[c] = state_letter; - else if (c < NOTCHAR) - trans[c] = state; - } - } #ifdef DEBUG fprintf (stderr, "trans table %td", s); @@ -2970,10 +2793,19 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) fprintf (stderr, "\n"); #endif - for (i = 0; i < ngrps; ++i) - free (grps[i].elems); + free (group.elems); free (follows.elems); free (tmp.elems); + + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + if (tstbit (d->syntax.eolbyte, label)) + { + d->newlines[s] = trans[d->syntax.eolbyte]; + trans[d->syntax.eolbyte] = -1; + } + + return trans[uc]; } /* Make sure D's state arrays are large enough to hold NEW_STATE. */ @@ -2983,75 +2815,70 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) state_num oldalloc = d->tralloc; if (oldalloc <= new_state) { - state_num **realtrans = d->trans ? d->trans - 1 : NULL; + state_num **realtrans = d->trans ? d->trans - 2 : NULL; size_t newalloc, newalloc1; - newalloc1 = new_state + 1; + newalloc1 = realtrans ? new_state + 2 : 0; realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans); - realtrans[0] = NULL; - d->trans = realtrans + 1; - d->tralloc = newalloc = newalloc1 - 1; + realtrans[0] = realtrans[1] = NULL; + d->trans = realtrans + 2; + d->tralloc = newalloc = newalloc1 - 2; d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); - if (d->multibyte) + if (d->localeinfo.multibyte) { - realtrans = d->mb_trans ? d->mb_trans - 1 : NULL; + realtrans = d->mb_trans ? d->mb_trans - 2 : NULL; realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); if (oldalloc == 0) - realtrans[0] = NULL; - d->mb_trans = realtrans + 1; + realtrans[0] = realtrans[1] = NULL; + d->mb_trans = realtrans + 2; } for (; oldalloc < newalloc; oldalloc++) { d->trans[oldalloc] = NULL; d->fails[oldalloc] = NULL; - if (d->multibyte) + if (d->localeinfo.multibyte) d->mb_trans[oldalloc] = NULL; } } } -/* Some routines for manipulating a compiled dfa's transition tables. - Each state may or may not have a transition table; if it does, and it - is a non-accepting state, then d->trans[state] points to its table. - If it is an accepting state then d->fails[state] points to its table. - If it has no table at all, then d->trans[state] is NULL. - TODO: Improve this comment, get rid of the unnecessary redundancy. */ +/* Calculate the transition table for a new state derived from state s + for a compiled dfa d after input character uc, and return the new + state number. */ -static void -build_state (state_num s, struct dfa *d) +static state_num +build_state (state_num s, struct dfa *d, unsigned char uc) { - state_num *trans; /* The new transition table. */ - state_num i, maxstate; - - /* Set an upper limit on the number of transition tables that will ever - exist at once. MAX_TRCOUNT is arbitrary. The idea is that the frequently - used transition tables will be quickly rebuilt, whereas the ones that - were only needed once or twice will be cleared away. However, do not - clear the initial D->min_trcount states, since they are always used. */ - if (MAX_TRCOUNT <= d->trcount) - { - for (i = d->min_trcount; i < d->tralloc; ++i) - { - free (d->trans[i]); - free (d->fails[i]); - d->trans[i] = d->fails[i] = NULL; - } - d->trcount = d->min_trcount; + /* A pointer to the new transition table, and the table itself. */ + state_num **ptrans = (ACCEPTING (s, *d) ? d->fails : d->trans) + s; + state_num *trans = *ptrans; - if (d->multibyte) + if (!trans) + { + /* MAX_TRCOUNT is an arbitrary upper limit on the number of + transition tables that can exist at once, other than for + initial states. Often-used transition tables are quickly + rebuilt, whereas rarely-used ones are cleared away. */ + if (MAX_TRCOUNT <= d->trcount) { - for (i = d->min_trcount; i < d->tralloc; i++) + for (state_num i = d->min_trcount; i < d->tralloc; i++) { - free (d->mb_trans[i]); - d->mb_trans[i] = NULL; + free (d->trans[i]); + free (d->fails[i]); + d->trans[i] = d->fails[i] = NULL; } - free (d->mb_trans[-1]); - d->mb_trans[-1] = NULL; + d->trcount = 0; } - } - ++d->trcount; + d->trcount++; + *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans); + + /* Fill transition table with a default value which means that the + transited state has not been calculated yet. */ + for (int i = 0; i < NOTCHAR; i++) + trans[i] = -2; + } /* Set up the success bits for this state. */ d->success[s] = 0; @@ -3062,27 +2889,18 @@ build_state (state_num s, struct dfa *d) if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d)) d->success[s] |= CTX_NONE; - trans = xmalloc (NOTCHAR * sizeof *trans); - dfastate (s, d, trans); + s = dfastate (s, d, uc, trans); /* Now go through the new transition table, and make sure that the trans and fail arrays are allocated large enough to hold a pointer for the largest state mentioned in the table. */ - maxstate = -1; - for (i = 0; i < NOTCHAR; ++i) + state_num maxstate = -1; + for (int i = 0; i < NOTCHAR; i++) if (maxstate < trans[i]) maxstate = trans[i]; realloc_trans_if_necessary (d, maxstate); - /* Keep the newline transition in a special place so we can use it as - a sentinel. */ - d->newlines[s] = trans[d->syntax.eolbyte]; - trans[d->syntax.eolbyte] = -1; - - if (ACCEPTING (s, *d)) - d->fails[s] = trans; - else - d->trans[s] = trans; + return s; } /* Multibyte character handling sub-routines for dfaexec. */ @@ -3096,23 +2914,13 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == d->syntax.eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) t = d->fails[s]; else { - build_state (s, d); + build_state (s, d, **pp); if (d->trans[s]) t = d->trans[s]; else @@ -3122,6 +2930,9 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) } } + if (t[**pp] == -2) + build_state (s, d, **pp); + return t[*(*pp)++]; } @@ -3132,15 +2943,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline, mb_index; - size_t i, j; + size_t i; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3148,35 +2956,13 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, /* Calculate the state which can be reached from the state 's' by consuming 'mbclen' single bytes from the buffer. */ s1 = s; - for (i = 0; i < mbclen && 0 <= s; i++) + for (i = 0; i < mbclen && (i == 0 || d->min_trcount <= s); i++) s = transit_state_singlebyte (d, s, pp); *pp += mbclen - i; - if (d->states[s1].curr_dependent) + if (wc == WEOF) { - if (s < 0) - d->mb_follows.nelem = 0; - else - copy (&d->states[s].elems, &d->mb_follows); - - for (i = 0; i < d->states[s1].mbps.nelem; i++) - { - if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) - continue; - for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; - j++) - insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j], - &d->mb_follows); - } - - separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - realloc_trans_if_necessary (d, s); - + /* It is an invalid character, so ANYCHAR is not accepted. */ return s; } @@ -3187,11 +2973,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, { if (MAX_TRCOUNT <= d->mb_trcount) { - state_num s2; - for (s2 = -1; s2 < d->tralloc; s2++) + state_num s3; + for (s3 = -1; s3 < d->tralloc; s3++) { - free (d->mb_trans[s2]); - d->mb_trans[s2] = NULL; + free (d->mb_trans[s3]); + d->mb_trans[s3] = NULL; } for (i = 0; i < d->sindex; i++) @@ -3201,40 +2987,29 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - mb_index = d->states[s1].mb_trindex * 2; - if (! d->mb_trans[s]) { enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] }; - enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE }; + enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE }; d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else - { - state = d->mb_trans[s][mb_index + context_newline]; - if (0 <= state) - return state; - } + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; - if (s < 0) + if (s == -1) copy (&d->states[s1].mbps, &d->mb_follows); else merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index] = state; - d->mb_trans[s][mb_index + 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3254,16 +3029,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, Both P and MBP must be no larger than END. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, - unsigned char const *mbp, char const *end, wint_t *wcp) + unsigned char const *mbp, char const *end) { - wint_t wc = WEOF; + wint_t wc; if (d->syntax.never_trail[*p]) return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); - if (wcp != NULL) - *wcp = wc; return mbp; } @@ -3299,12 +3072,42 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, unsigned char saved_end; size_t nlcount = 0; - if (!d->tralloc) + if (MAX_TRCOUNT <= d->sindex) { - realloc_trans_if_necessary (d, 1); - build_state (0, d); + for (s = d->min_trcount; s < d->sindex; s++) + { + free (d->states[s].elems.elems); + free (d->states[s].mbps.elems); + } + d->sindex = d->min_trcount; + + if (d->trans) + { + for (s = 0; s < d->tralloc; s++) + { + free (d->trans[s]); + free (d->fails[s]); + d->trans[s] = d->fails[s] = NULL; + } + d->trcount = 0; + } + + if (d->localeinfo.multibyte && d->mb_trans) + { + for (s = -1; s < d->tralloc; s++) + { + free (d->mb_trans[s]); + d->mb_trans[s] = NULL; + } + for (s = 0; s < d->min_trcount; s++) + d->states[s].mb_trindex = -1; + d->mb_trcount = 0; + } } + if (!d->tralloc) + realloc_trans_if_necessary (d, 0); + s = s1 = 0; p = mbp = (unsigned char const *) begin; trans = d->trans; @@ -3320,51 +3123,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, for (;;) { - if (multibyte) + while ((t = trans[s]) != NULL) { - while ((t = trans[s]) != NULL) + if (s < d->min_trcount) { - s1 = s; - - if (s < d->min_trcount) + if (!multibyte || d->states[s].mbps.nelem == 0) { - if (d->min_trcount == 1) - { - if (d->states[s].mbps.nelem == 0) - { - do - { - while (t[*p] == 0) - p++; - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - while (t[*p] == 0); - } - else - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - else - { - wint_t wc; - mbp = skip_remains_mb (d, p, mbp, end, &wc); - - /* If d->min_trcount is greater than 1, maybe - transit to another initial state after skip. */ - if (p < mbp) - { - /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE - cannot happen, as we assume that a newline - is always a single byte character. */ - s1 = s = d->initstate_notbol; - p = mbp; - } - } + while (t[*p] == s) + p++; } + if (multibyte) + p = mbp = skip_remains_mb (d, p, mbp, end); + } - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (multibyte) + { + s1 = s; + + if (d->states[s].mbps.nelem == 0 + || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3373,28 +3150,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } } - } - else - { - if (s == 0) - { - t = trans[s]; - if (t) - { - while (t[*p] == 0) - p++; - s1 = 0; - s = t[*p++]; - } - } - - while ((t = trans[s]) != NULL) + else { s1 = t[*p++]; t = trans[s1]; @@ -3405,36 +3165,54 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = tmp; /* swap */ break; } + if (s < d->min_trcount) + { + while (t[*p] == s1) + p++; + } s = t[*p++]; } } if (s < 0) { - if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0) + if (s == -2) + { + s = build_state (s1, d, p[-1]); + trans = d->trans; + } + else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1]) + { + /* The previous character was a newline. Count it, and skip + checking of multibyte character boundary until here. */ + nlcount++; + mbp = p; + + s = (allow_nl ? d->newlines[s1] + : d->syntax.sbit[eol] == CTX_NEWLINE ? 0 + : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1 + : d->initstate_notbol); + } + else { p = NULL; goto done; } - - /* The previous character was a newline, count it, and skip - checking of multibyte character boundary until here. */ - nlcount++; - mbp = p; - - s = allow_nl ? d->newlines[s1] : 0; } else if (d->fails[s]) { - if (d->success[s] & d->syntax.sbit[*p]) + if ((d->success[s] & d->syntax.sbit[*p]) + || ((char *) p == end + && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, + *d))) goto done; + if (multibyte && s < d->min_trcount) + p = mbp = skip_remains_mb (d, p, mbp, end); + s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 - || (*p == eol && !allow_nl) - || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) { /* If a input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3443,15 +3221,13 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } } else { - build_state (s, d); + build_state (s, d, p[0]); trans = d->trans; } } @@ -3490,7 +3266,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, return (char *) begin; } -/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte), +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), but faster and set *BACKREF if the DFA code does not support this regexp usage. */ @@ -3531,7 +3307,7 @@ free_mbdata (struct dfa *d) state_num s; for (s = -1; s < d->tralloc; s++) free (d->mb_trans[s]); - free (d->mb_trans - 1); + free (d->mb_trans - 2); } } @@ -3548,7 +3324,7 @@ dfa_supported (struct dfa const *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (!d->multibyte) + if (!d->localeinfo.multibyte) continue; /* fallthrough */ @@ -3566,7 +3342,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8) + if (!d->localeinfo.using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -3596,7 +3372,7 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->multibyte = false; + d->localeinfo.multibyte = false; d->dfaexec = dfaexec_sb; d->fast = true; } @@ -3605,13 +3381,12 @@ static void dfassbuild (struct dfa *d) { size_t i, j; - charclass ccl; bool have_achar = false; bool have_nchar = false; struct dfa *sup = dfaalloc (); *sup = *d; - sup->multibyte = false; + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; @@ -3642,26 +3417,29 @@ dfassbuild (struct dfa *d) case ANYCHAR: case MBCSET: case BACKREF: - zeroset (ccl); - notset (ccl); - sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); - sup->tokens[j++] = STAR; - if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR - || d->tokens[i + 1] == PLUS) - i++; - have_achar = true; + { + charclass ccl; + fillset (ccl); + sup->tokens[j++] = CSET + charclass_index (sup, ccl); + sup->tokens[j++] = STAR; + if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR + || d->tokens[i + 1] == PLUS) + i++; + have_achar = true; + } break; case BEGWORD: case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (d->multibyte) + if (d->localeinfo.multibyte) { /* These constraints aren't supported in a multibyte locale. Ignore them in the superset DFA. */ sup->tokens[j++] = EMPTY; break; } + /* fallthrough */ default: sup->tokens[j++] = d->tokens[i]; if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR) @@ -3672,7 +3450,7 @@ dfassbuild (struct dfa *d) } sup->tindex = j; - if (have_nchar && (have_achar || d->multibyte)) + if (have_nchar && (have_achar || d->localeinfo.multibyte)) d->superset = sup; else { @@ -3714,7 +3492,7 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->multibyte) + if (d->localeinfo.multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) @@ -3739,7 +3517,7 @@ dfafree (struct dfa *d) free (d->fails[i]); } - free (d->trans - 1); + free (d->trans - 2); free (d->fails); free (d->newlines); free (d->success); @@ -4238,20 +4016,56 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xcalloc (1, sizeof (struct dfa)); - d->multibyte = MB_CUR_MAX > 1; - d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; - d->fast = !d->multibyte; - d->lexstate.cur_mb_len = 1; - return d; + void *p = xmalloc (sizeof (struct dfa)); + if (p) + { + memset (p, 0, sizeof (struct dfa)); + } + return p; } +/* Initialize DFA. */ void -dfa_init (void) +dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, + reg_syntax_t bits, int dfaopts) { - check_utf8 (); - check_unibyte_c (); - init_mbrtowc_cache (); + int i; + memset (dfa, 0, offsetof (struct dfa, dfaexec)); + dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; + dfa->simple_locale = using_simple_locale (linfo->multibyte); + dfa->localeinfo = *linfo; + + dfa->fast = !dfa->localeinfo.multibyte; + + dfa->canychar = -1; + dfa->lex.cur_mb_len = 1; + dfa->syntax.syntax_bits_set = true; + dfa->syntax.case_fold = (bits & RE_ICASE) != 0; + dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; + dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; + dfa->syntax.syntax_bits = bits; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + dfa->syntax.sbit[uc] = char_context (dfa, uc); + switch (dfa->syntax.sbit[uc]) + { + case CTX_LETTER: + setbit (uc, dfa->syntax.letters); + break; + case CTX_NEWLINE: + setbit (uc, dfa->syntax.newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 + ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* vim:set shiftwidth=2: */ |