diff options
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | dfa.c | 1425 | ||||
-rw-r--r-- | dfa.h | 28 |
3 files changed, 700 insertions, 757 deletions
@@ -1,3 +1,7 @@ +2014-04-25 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.h, dfa.c: Merge with GNU grep; lots of forward motion. + 2014-04-24 Arnold D. Robbins <arnold@skeeve.com> Update xalloc.h for pending merge with dfa. @@ -37,21 +37,11 @@ #if HAVE_SETLOCALE #include <locale.h> #endif -#ifdef HAVE_STDBOOL_H -#include <stdbool.h> -#else -#include "missing_d/gawkbool.h" -#endif /* HAVE_STDBOOL_H */ -/* Gawk doesn't use Gnulib, so don't assume that setlocale and - static_assert are present. */ +/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */ #ifndef LC_ALL # define setlocale(category, locale) NULL #endif -#ifndef static_assert -# define static_assert(cond, diagnostic) \ - extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] -#endif #define STREQ(a, b) (strcmp (a, b) == 0) @@ -85,10 +75,6 @@ #endif #endif /* GAWK */ -#if HAVE_LANGINFO_CODESET -# include <langinfo.h> -#endif - #include "xalloc.h" #include "dfa.h" @@ -338,7 +324,8 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - char backref; /* True if this state matches a \<digit>. */ + bool has_backref; /* True if this state matches a \<digit>. */ + bool has_mbcset; /* True if this state matches a MBCSET. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -355,13 +342,16 @@ typedef ptrdiff_t state_num; struct mb_char_classes { ptrdiff_t cset; - int invert; + bool invert; wchar_t *chars; /* Normal characters. */ size_t nchars; wctype_t *ch_classes; /* Character classes. */ size_t nch_classes; - wchar_t *range_sts; /* Range characters (start of the range). */ - wchar_t *range_ends; /* Range characters (end of the range). */ + struct /* Range characters. */ + { + wchar_t beg; /* Range start. */ + wchar_t end; /* Range end. */ + } *ranges; size_t nranges; char **equivs; /* Equivalence classes. */ size_t nequivs; @@ -387,10 +377,11 @@ struct dfa size_t nleaves; /* Number of leaves on the parse tree. */ size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ - unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */ + bool multibyte; /* True iff MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ + mbstate_t mbs; /* Multibyte conversion state. */ - /* The following are used only if MB_CUR_MAX > 1. */ + /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. if tokens[i] < NOTCHAR @@ -409,7 +400,6 @@ struct dfa multibyte_prop = 3 , 1 , 0 , 2 , 3 */ - size_t nmultibyte_prop; int *multibyte_prop; #if MBS_SUPPORT @@ -425,10 +415,13 @@ struct dfa size_t nmbcsets; size_t mbcsets_alloc; + /* Fields filled by the superset. */ + struct dfa *superset; /* Hint of the dfa. */ + /* Fields filled by the state builder. */ dfa_state *states; /* States of the dfa. */ state_num sindex; /* Index for adding new states. */ - state_num salloc; /* Number of states currently allocated. */ + size_t salloc; /* Number of states currently allocated. */ /* Fields filled by the parse tree->NFA conversion. */ position_set *follows; /* Array of follow sets, indexed by position @@ -438,7 +431,7 @@ struct dfa matching the given position in a string matching the regexp. Allocated to the maximum possible position index. */ - int searchflag; /* True if we are supposed to build a searching + bool searchflag; /* True if we are supposed to build a searching as opposed to an exact matcher. A searching matcher finds the first and shortest string matching a regexp anywhere in the buffer, @@ -448,16 +441,16 @@ struct dfa /* Fields filled by dfaexec. */ state_num tralloc; /* Number of transition tables that have - slots so far. */ + slots so far, not counting trans[-1]. */ int trcount; /* Number of transition tables that have actually been built. */ state_num **trans; /* Transition tables for states that can never accept. If the transitions for a state have not yet been computed, or the state could possibly accept, its entry in - this table is NULL. */ - state_num **realtrans; /* Trans always points to realtrans + 1; this - is so trans[-1] can contain NULL. */ + this table is NULL. This points to one + past the start of the allocated array, + and trans[-1] is always NULL. */ state_num **fails; /* Transition tables after failing to accept on a state that potentially could do so. */ int *success; /* Table of acceptance conditions used in @@ -472,6 +465,11 @@ struct dfa struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ + position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET + on demand. */ + int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or + MBCSET. Null if mb_follows.elems has not + been allocated. */ }; /* Some macros for user access to dfa internals. */ @@ -487,41 +485,6 @@ struct dfa static void dfamust (struct dfa *dfa); static void regexp (void); -/* These two macros are identical to the ones in gnulib's xalloc.h, - except that they do not cast the result to "(t *)", and thus may - be used via type-free CALLOC and MALLOC macros. */ -#undef XNMALLOC -#undef XCALLOC - -/* Allocate memory for N elements of type T, with error checking. */ -/* extern t *XNMALLOC (size_t n, typename t); */ -# define XNMALLOC(n, t) \ - (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t))) - -/* Allocate memory for N elements of type T, with error checking, - and zero it. */ -/* extern t *XCALLOC (size_t n, typename t); */ -# define XCALLOC(n, t) \ - (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t))) - -#define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0) -#undef MALLOC /* Irix defines this */ -#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0) -#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0) - -/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */ -#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \ - do \ - { \ - if ((n_alloc) <= (n_required)) \ - { \ - size_t new_n_alloc = (n_required) + !(p); \ - (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \ - (n_alloc) = new_n_alloc; \ - } \ - } \ - while (false) - static void dfambcache (struct dfa *d) { @@ -546,33 +509,34 @@ dfambcache (struct dfa *d) } #if MBS_SUPPORT -/* Given the dfa D, store into *PWC the result of converting the - leading bytes of the multibyte buffer S of length N bytes, updating - the conversion state in *MBS. On conversion error, convert just a - single byte as-is. Return the number of bytes converted. +/* Store into *PWC the result of converting the leading bytes of the + multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + and updating the conversion state in *D. On conversion error, + convert just a single byte as-is. Return the number of bytes + converted. - This differs from mbrtowc (PWC, S, N, MBS) as follows: + This differs from mbrtowc (PWC, S, N, &D->mbs) as follows: - * Extra arg D, containing an mbrtowc_cache for speed. + * The last arg is a dfa *D instead of merely a multibyte conversion + state D->mbs. D also contains an mbrtowc_cache for speed. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. * The return value is always in the range 1..N. - * *MBS is always valid afterwards. + * D->mbs is always valid afterwards. * *PWC is always set to something. */ static size_t -mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n, - mbstate_t *mbs) +mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; wint_t wc = d->mbrtowc_cache[uc]; if (wc == WEOF) { - size_t nbytes = mbrtowc (pwc, s, n, mbs); + size_t nbytes = mbrtowc (pwc, s, n, &d->mbs); if (0 < nbytes && nbytes < (size_t) -2) return nbytes; - memset (mbs, 0, sizeof *mbs); + memset (&d->mbs, 0, sizeof d->mbs); wc = uc; } @@ -700,36 +664,60 @@ notset (charclass s) s[i] = ~s[i]; } -static int +static bool equal (charclass const s1, charclass const s2) { return memcmp (s1, s2, sizeof (charclass)) == 0; } -/* A pointer to the current dfa is kept here during parsing. */ -static struct dfa *dfa; +/* Ensure that the array addressed by PTR holds at least NITEMS + + (PTR || !NITEMS) items. Either return PTR, or reallocate the array + and return its new address. Although PTR may be null, the returned + value is never null. + + The array holds *NALLOC items; *NALLOC must be zero if PTR is null, + and is updated on reallocation. ITEMSIZE is the size of one item. + Avoid O(N**2) behavior on arrays growing linearly. */ +static void * +maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) +{ + if (nitems < *nalloc) + return ptr; + *nalloc = nitems; + return x2nrealloc (ptr, nalloc, itemsize); +} -/* Find the index of charclass s in dfa->charclasses, or allocate a - new charclass. */ +/* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -charclass_index (charclass const s) +dfa_charclass_index (struct dfa *d, charclass const s) { size_t i; - for (i = 0; i < dfa->cindex; ++i) - if (equal (s, dfa->charclasses[i])) + for (i = 0; i < d->cindex; ++i) + if (equal (s, d->charclasses[i])) return i; - REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1); - ++dfa->cindex; - copyset (s, dfa->charclasses[i]); + d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc, + sizeof *d->charclasses); + ++d->cindex; + copyset (s, d->charclasses[i]); return i; } +/* A pointer to the current dfa is kept here during parsing. */ +static struct dfa *dfa; + +/* Find the index of charclass S in the current DFA, or allocate a new one. */ +static size_t +charclass_index (charclass const s) +{ + return dfa_charclass_index (dfa, s); +} + /* Syntax bits controlling the behavior of the lexical analyzer. */ static reg_syntax_t syntax_bits, syntax_bits_set; /* Flag for case-folding letters into sets. */ -static int case_fold; +static bool case_fold; /* End-of-line byte in data. */ static unsigned char eolbyte; @@ -759,7 +747,7 @@ static charclass newline; static int char_context (unsigned char c) { - if (c == eolbyte || c == 0) + if (c == eolbyte) return CTX_NEWLINE; if (IS_WORD_CONSTITUENT (c)) return CTX_LETTER; @@ -784,7 +772,7 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) syntax_bits_set = 1; syntax_bits = bits; - case_fold = fold; + case_fold = fold != 0; eolbyte = eol; for (i = 0; i < NOTCHAR; ++i) @@ -843,19 +831,16 @@ int using_utf8 (void) { static int utf8 = -1; - if (utf8 == -1) + if (utf8 < 0) { -#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT - utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); -#else - utf8 = 0; -#endif + wchar_t wc; + mbstate_t mbs = { 0 }; + utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; #ifdef LIBC_IS_BORKED if (gawk_mb_cur_max == 1) utf8 = 0; #endif } - return utf8; } @@ -907,7 +892,7 @@ using_simple_locale (void) static char const *lexptr; /* Pointer to next input character. */ static size_t lexleft; /* Number of characters remaining. */ static token lasttok; /* Previous token returned; initially END. */ -static int laststart; /* True if we're separated from beginning or (, +static bool laststart; /* True if we're separated from beginning or (, | only by zero-width characters. */ static size_t parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -915,27 +900,8 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */ static int cur_mb_len = 1; /* Length of the multibyte representation of wctok. */ /* These variables are used only if (MB_CUR_MAX > 1). */ -static mbstate_t mbs; /* mbstate for mbrtowc. */ static wchar_t wctok; /* Wide character representation of the current multibyte character. */ -static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec. - Each element stores the number of remaining - bytes of the corresponding multibyte - character in the input string. A element's - value is 0 if the corresponding character is - single-byte. - e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)> - mblen_buf : 0, 3, 2, 1 - */ -static wchar_t *inputwcs; /* Wide character representation of the input - string in dfaexec. - The length of this array is the same as - the length of input string (char array). - inputstring[i] is a single-byte char, - or the first byte of a multibyte char; - inputwcs[i] is the codepoint. */ -static unsigned char const *buf_begin; /* reference to begin in dfaexec. */ -static unsigned char const *buf_end; /* reference to end in dfaexec. */ #if MBS_SUPPORT @@ -952,7 +918,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ else \ { \ wchar_t _wc; \ - size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \ + size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \ cur_mb_len = nbytes; \ (wc) = _wc; \ (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \ @@ -999,14 +965,17 @@ static short const lonesome_lower[] = 0x03F5, 0x1E9B, 0x1FBE, }; -static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2 - == CASE_FOLDED_BUFSIZE), - "CASE_FOLDED_BUFSIZE is wrong"); +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is 1 for towupper, 1 for towlower, and 1 for each + entry in LONESOME_LOWER. */ +enum +{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; /* Find the characters equal to C after case-folding, other than C itself, and store them into FOLDED. Return the number of characters stored. */ -int +static int case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) { int i; @@ -1071,7 +1040,7 @@ find_pred (const char *str) static token parse_bracket_exp (void) { - int invert; + bool invert; int c, c1, c2; charclass ccl; @@ -1092,16 +1061,14 @@ parse_bracket_exp (void) /* Work area to build a mb_char_classes. */ struct mb_char_classes *work_mbc; - size_t chars_al, range_sts_al, range_ends_al, ch_classes_al, - equivs_al, coll_elems_al; + size_t chars_al, ranges_al, ch_classes_al, equivs_al, coll_elems_al; - chars_al = 0; - range_sts_al = range_ends_al = 0; - ch_classes_al = equivs_al = coll_elems_al = 0; - if (MB_CUR_MAX > 1) + chars_al = ranges_al = ch_classes_al = equivs_al = coll_elems_al = 0; + if (dfa->multibyte) { - REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc, - dfa->nmbcsets + 1); + dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, + &dfa->mbcsets_alloc, + sizeof *dfa->mbcsets); /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. We will update dfa->multibyte_prop[] in addtok, because we can't @@ -1119,11 +1086,11 @@ parse_bracket_exp (void) if (c == '^') { FETCH_WC (c, wc, _("unbalanced [")); - invert = 1; + invert = true; known_bracket_exp = using_simple_locale (); } else - invert = 0; + invert = false; colon_warning_state = (c == ':'); do @@ -1173,14 +1140,15 @@ parse_bracket_exp (void) if (!pred) dfaerror (_("invalid character class")); - if (MB_CUR_MAX > 1 && !pred->single_byte_only) + if (dfa->multibyte && !pred->single_byte_only) { /* Store the character class as wctype_t. */ wctype_t wt = (wctype_t) wctype (class); - REALLOC_IF_NECESSARY (work_mbc->ch_classes, - ch_classes_al, - work_mbc->nch_classes + 1); + work_mbc->ch_classes + = maybe_realloc (work_mbc->ch_classes, + work_mbc->nch_classes, &ch_classes_al, + sizeof *work_mbc->ch_classes); work_mbc->ch_classes[work_mbc->nch_classes++] = wt; } @@ -1227,29 +1195,25 @@ parse_bracket_exp (void) if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH_WC (c2, wc2, _("unbalanced [")); - if (MB_CUR_MAX > 1) + if (dfa->multibyte) { /* When case folding map a range, say [m-z] (or even [M-z]) to the pair of ranges, [m-z] [M-Z]. Although this code is wrong in multiple ways, it's never used in practice. FIXME: Remove this (and related) unused code. */ - REALLOC_IF_NECESSARY (work_mbc->range_sts, - range_sts_al, work_mbc->nranges + 1); - REALLOC_IF_NECESSARY (work_mbc->range_ends, - range_ends_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = - case_fold ? towlower (wc) : (wchar_t) wc; - work_mbc->range_ends[work_mbc->nranges++] = - case_fold ? towlower (wc2) : (wchar_t) wc2; + work_mbc->ranges + = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2, + &ranges_al, sizeof *work_mbc->ranges); + work_mbc->ranges[work_mbc->nranges].beg + = case_fold ? towlower (wc) : wc; + work_mbc->ranges[work_mbc->nranges++].end + = case_fold ? towlower (wc2) : wc2; if (case_fold && (iswalpha (wc) || iswalpha (wc2))) { - REALLOC_IF_NECESSARY (work_mbc->range_sts, - range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = towupper (wc); - REALLOC_IF_NECESSARY (work_mbc->range_ends, - range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + work_mbc->ranges[work_mbc->nranges].beg = towupper (wc); + work_mbc->ranges[work_mbc->nranges++].end + = towupper (wc2); } } else if (using_simple_locale ()) @@ -1284,7 +1248,7 @@ parse_bracket_exp (void) colon_warning_state |= (c == ':') ? 2 : 4; - if (MB_CUR_MAX == 1) + if (!dfa->multibyte) { if (case_fold) setbit_case_fold_c (c, ccl); @@ -1297,16 +1261,17 @@ parse_bracket_exp (void) { wchar_t folded[CASE_FOLDED_BUFSIZE]; int i, n = case_folded_counterparts (wc, folded); - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + n); + work_mbc->chars = maybe_realloc (work_mbc->chars, + work_mbc->nchars + n, &chars_al, + sizeof *work_mbc->chars); for (i = 0; i < n; i++) if (!setbit_wc (folded[i], ccl)) work_mbc->chars[work_mbc->nchars++] = folded[i]; } if (!setbit_wc (wc, ccl)) { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); + work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars, + &chars_al, sizeof *work_mbc->chars); work_mbc->chars[work_mbc->nchars++] = wc; } } @@ -1318,7 +1283,7 @@ parse_bracket_exp (void) if (! known_bracket_exp) return BACKREF; - if (MB_CUR_MAX > 1) + if (dfa->multibyte) { static charclass zeroclass; work_mbc->invert = invert; @@ -1328,7 +1293,7 @@ parse_bracket_exp (void) if (invert) { - assert (MB_CUR_MAX == 1); + assert (!dfa->multibyte); notset (ccl); if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit (eolbyte, ccl); @@ -1341,7 +1306,7 @@ static token lex (void) { unsigned int c, c2; - int backslash = 0; + bool backslash = false; charclass ccl; int i; @@ -1364,7 +1329,7 @@ lex (void) goto normal_char; if (lexleft == 0) dfaerror (_("unfinished \\ escape")); - backslash = 1; + backslash = true; break; case '^': @@ -1402,7 +1367,7 @@ lex (void) case '9': if (backslash && !(syntax_bits & RE_NO_BK_REFS)) { - laststart = 0; + laststart = false; return lasttok = BACKREF; } goto normal_char; @@ -1517,7 +1482,7 @@ lex (void) lexptr = p; lexleft = lim - p; } - laststart = 0; + laststart = false; return lasttok = REPMN; case '|': @@ -1525,21 +1490,21 @@ lex (void) goto normal_char; if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - laststart = 1; + laststart = true; return lasttok = OR; case '\n': if (syntax_bits & RE_LIMITED_OPS || backslash || !(syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - laststart = 1; + laststart = true; return lasttok = OR; case '(': if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; ++parens; - laststart = 1; + laststart = true; return lasttok = LPAREN; case ')': @@ -1548,17 +1513,17 @@ lex (void) if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; --parens; - laststart = 0; + laststart = false; return lasttok = RPAREN; case '.': if (backslash) goto normal_char; - if (MB_CUR_MAX > 1) + if (dfa->multibyte) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ - laststart = 0; + laststart = false; return lasttok = ANYCHAR; } zeroset (ccl); @@ -1567,14 +1532,14 @@ lex (void) clrbit (eolbyte, ccl); if (syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); case 's': case 'S': if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (MB_CUR_MAX == 1) + if (!dfa->multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1582,7 +1547,7 @@ lex (void) setbit (c2, ccl); if (c == 'S') notset (ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); } @@ -1612,7 +1577,7 @@ lex (void) POP_LEX_STATE (); - laststart = 0; + laststart = false; return lasttok; case 'w': @@ -1625,21 +1590,21 @@ lex (void) setbit (c2, ccl); if (c == 'W') notset (ccl); - laststart = 0; + laststart = false; return lasttok = CSET + charclass_index (ccl); case '[': if (backslash) goto normal_char; - laststart = 0; + laststart = false; return lasttok = parse_bracket_exp (); default: normal_char: - laststart = 0; + laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (MB_CUR_MAX > 1) + if (dfa->multibyte) return lasttok = WCHAR; if (case_fold && isalpha (c)) @@ -1671,14 +1636,16 @@ static size_t depth; /* Current depth of a hypothetical stack static void addtok_mb (token t, int mbprop) { - if (MB_CUR_MAX > 1) + if (dfa->talloc == dfa->tindex) { - REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop, - dfa->tindex + 1); - dfa->multibyte_prop[dfa->tindex] = mbprop; + dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, + sizeof *dfa->tokens); + if (dfa->multibyte) + dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, + sizeof *dfa->multibyte_prop); } - - REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1); + if (dfa->multibyte) + dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; switch (t) @@ -1710,7 +1677,7 @@ static void addtok_wc (wint_t wc); static void addtok (token t) { - if (MB_CUR_MAX > 1 && t == MBCSET) + if (dfa->multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1967,7 +1934,7 @@ copytoks (size_t tindex, size_t ntokens) { size_t i; - if (MB_CUR_MAX > 1) + if (dfa->multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -2050,12 +2017,12 @@ dfaparse (char const *s, size_t len, struct dfa *d) lexptr = s; lexleft = len; lasttok = END; - laststart = 1; + laststart = true; parens = 0; - if (MB_CUR_MAX > 1) + if (dfa->multibyte) { cur_mb_len = 0; - memset (&mbs, 0, sizeof mbs); + memset (&d->mbs, 0, sizeof d->mbs); } if (!syntax_bits_set) @@ -2080,19 +2047,24 @@ dfaparse (char const *s, size_t len, struct dfa *d) /* Some primitives for operating on sets of positions. */ -/* Copy one set to another; the destination must be large enough. */ +/* Copy one set to another. */ static void copy (position_set const *src, position_set * dst) { - REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem); - memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem); + if (dst->alloc < src->nelem) + { + free (dst->elems); + dst->alloc = src->nelem; + dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems); + } + memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems); dst->nelem = src->nelem; } static void alloc_position_set (position_set * s, size_t size) { - MALLOC (s->elems, size); + s->elems = xnmalloc (size, sizeof *s->elems); s->alloc = size; s->nelem = 0; } @@ -2122,7 +2094,7 @@ insert (position p, position_set * s) return; } - REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1); + s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems); for (i = count; i > lo; i--) s->elems[i] = s->elems[i - 1]; s->elems[lo] = p; @@ -2136,7 +2108,12 @@ merge (position_set const *s1, position_set const *s2, position_set * m) { size_t i = 0, j = 0; - REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem); + if (m->alloc < s1->nelem + s2->nelem) + { + free (m->elems); + m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc, + sizeof *m->elems); + } m->nelem = 0; while (i < s1->nelem && j < s2->nelem) if (s1->elems[i].index > s2->elems[j].index) @@ -2197,19 +2174,19 @@ state_index (struct dfa *d, position_set const *s, int context) } /* We'll have to create a new state. */ - REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1); + d->states = maybe_realloc (d->states, d->sindex, &d->salloc, + sizeof *d->states); d->states[i].hash = hash; alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].backref = 0; + d->states[i].has_backref = false; + d->states[i].has_mbcset = false; d->states[i].constraint = 0; d->states[i].first_end = 0; - if (MBS_SUPPORT) - { - d->states[i].mbps.nelem = 0; - d->states[i].mbps.elems = NULL; - } + d->states[i].mbps.nelem = 0; + d->states[i].mbps.elems = NULL; + for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -2222,7 +2199,7 @@ state_index (struct dfa *d, position_set const *s, int context) else if (d->tokens[s->elems[j].index] == BACKREF) { d->states[i].constraint = NO_CONSTRAINT; - d->states[i].backref = 1; + d->states[i].has_backref = true; } ++d->sindex; @@ -2239,10 +2216,10 @@ static void epsclosure (position_set * s, struct dfa const *d) { size_t i, j; - char *visited; /* Array of booleans, enough to use char, not int. */ position p, old; - CALLOC (visited, d->tindex); + /* Array of booleans, large enough to use char, not int. */ + char *visited = xzalloc (d->tindex); for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR @@ -2399,17 +2376,26 @@ state_separate_contexts (position_set const *s) void dfaanalyze (struct dfa *d, int searchflag) { - int *nullable; /* Nullable stack. */ - size_t *nfirstpos; /* Element count stack for firstpos sets. */ - position *firstpos; /* Array where firstpos elements are stored. */ - size_t *nlastpos; /* Element count stack for lastpos sets. */ - position *lastpos; /* Array where lastpos elements are stored. */ + /* Array allocated to hold position sets. */ + position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc); + /* Firstpos and lastpos elements. */ + position *firstpos = posalloc + d->nleaves; + position *lastpos = firstpos + d->nleaves; + + /* Stack for element counts and nullable flags. */ + struct + { + /* Whether the entry is nullable. */ + bool nullable; + + /* Counts of firstpos and lastpos sets. */ + size_t nfirstpos; + size_t nlastpos; + } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc; + position_set tmp; /* Temporary set for merging sets. */ position_set merged; /* Result of merging sets. */ int separate_contexts; /* Context wanted by some position. */ - int *o_nullable; - size_t *o_nfirst, *o_nlast; - position *o_firstpos, *o_lastpos; size_t i, j; position *pos; @@ -2423,21 +2409,9 @@ dfaanalyze (struct dfa *d, int searchflag) putc ('\n', stderr); #endif - d->searchflag = searchflag; - - MALLOC (nullable, d->depth); - o_nullable = nullable; - MALLOC (nfirstpos, d->depth); - o_nfirst = nfirstpos; - MALLOC (firstpos, d->nleaves); - o_firstpos = firstpos, firstpos += d->nleaves; - MALLOC (nlastpos, d->depth); - o_nlast = nlastpos; - MALLOC (lastpos, d->nleaves); - o_lastpos = lastpos, lastpos += d->nleaves; + d->searchflag = searchflag != 0; alloc_position_set (&merged, d->nleaves); - - CALLOC (d->follows, d->tindex); + d->follows = xcalloc (d->tindex, sizeof *d->follows); for (i = 0; i < d->tindex; ++i) { @@ -2445,20 +2419,21 @@ dfaanalyze (struct dfa *d, int searchflag) { case EMPTY: /* The empty set is nullable. */ - *nullable++ = 1; + stk->nullable = true; /* The firstpos and lastpos of the empty leaf are both empty. */ - *nfirstpos++ = *nlastpos++ = 0; + stk->nfirstpos = stk->nlastpos = 0; + stk++; break; case STAR: case PLUS: /* Every element in the firstpos of the argument is in the follow of every element in the lastpos. */ - tmp.nelem = nfirstpos[-1]; + tmp.nelem = stk[-1].nfirstpos; tmp.elems = firstpos; pos = lastpos; - for (j = 0; j < nlastpos[-1]; ++j) + for (j = 0; j < stk[-1].nlastpos; ++j) { merge (&tmp, &d->follows[pos[j].index], &merged); copy (&merged, &d->follows[pos[j].index]); @@ -2467,16 +2442,16 @@ dfaanalyze (struct dfa *d, int searchflag) case QMARK: /* A QMARK or STAR node is automatically nullable. */ if (d->tokens[i] != PLUS) - nullable[-1] = 1; + stk[-1].nullable = true; break; case CAT: /* Every element in the firstpos of the second argument is in the follow of every element in the lastpos of the first argument. */ - tmp.nelem = nfirstpos[-1]; + tmp.nelem = stk[-1].nfirstpos; tmp.elems = firstpos; - pos = lastpos + nlastpos[-1]; - for (j = 0; j < nlastpos[-2]; ++j) + pos = lastpos + stk[-1].nlastpos; + for (j = 0; j < stk[-2].nlastpos; ++j) { merge (&tmp, &d->follows[pos[j].index], &merged); copy (&merged, &d->follows[pos[j].index]); @@ -2484,43 +2459,39 @@ dfaanalyze (struct dfa *d, int searchflag) /* The firstpos of a CAT node is the firstpos of the first argument, union that of the second argument if the first is nullable. */ - if (nullable[-2]) - nfirstpos[-2] += nfirstpos[-1]; + if (stk[-2].nullable) + stk[-2].nfirstpos += stk[-1].nfirstpos; else - firstpos += nfirstpos[-1]; - --nfirstpos; + firstpos += stk[-1].nfirstpos; /* The lastpos of a CAT node is the lastpos of the second argument, union that of the first argument if the second is nullable. */ - if (nullable[-1]) - nlastpos[-2] += nlastpos[-1]; + if (stk[-1].nullable) + stk[-2].nlastpos += stk[-1].nlastpos; else { - pos = lastpos + nlastpos[-2]; - for (j = nlastpos[-1]; j-- > 0;) + pos = lastpos + stk[-2].nlastpos; + for (j = stk[-1].nlastpos; j-- > 0;) pos[j] = lastpos[j]; - lastpos += nlastpos[-2]; - nlastpos[-2] = nlastpos[-1]; + lastpos += stk[-2].nlastpos; + stk[-2].nlastpos = stk[-1].nlastpos; } - --nlastpos; /* A CAT node is nullable if both arguments are nullable. */ - nullable[-2] = nullable[-1] && nullable[-2]; - --nullable; + stk[-2].nullable &= stk[-1].nullable; + stk--; break; case OR: /* The firstpos is the union of the firstpos of each argument. */ - nfirstpos[-2] += nfirstpos[-1]; - --nfirstpos; + stk[-2].nfirstpos += stk[-1].nfirstpos; /* The lastpos is the union of the lastpos of each argument. */ - nlastpos[-2] += nlastpos[-1]; - --nlastpos; + stk[-2].nlastpos += stk[-1].nlastpos; /* An OR node is nullable if either argument is nullable. */ - nullable[-2] = nullable[-1] || nullable[-2]; - --nullable; + stk[-2].nullable |= stk[-1].nullable; + stk--; break; default: @@ -2529,10 +2500,12 @@ dfaanalyze (struct dfa *d, int searchflag) an "epsilon closure" effectively makes them nullable later. Backreferences have to get a real position so we can detect transitions on them later. But they are nullable. */ - *nullable++ = d->tokens[i] == BACKREF; + stk->nullable = d->tokens[i] == BACKREF; /* This position is in its own firstpos and lastpos. */ - *nfirstpos++ = *nlastpos++ = 1; + stk->nfirstpos = stk->nlastpos = 1; + stk++; + --firstpos, --lastpos; firstpos->index = lastpos->index = i; firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; @@ -2546,15 +2519,16 @@ dfaanalyze (struct dfa *d, int searchflag) fprintf (stderr, "node %zd:", i); prtok (d->tokens[i]); putc ('\n', stderr); - fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); + fprintf (stderr, + stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n"); fprintf (stderr, " firstpos:"); - for (j = nfirstpos[-1]; j-- > 0;) + for (j = stk[-1].nfirstpos; j-- > 0;) { fprintf (stderr, " %zd:", firstpos[j].index); prtok (d->tokens[firstpos[j].index]); } fprintf (stderr, "\n lastpos:"); - for (j = nlastpos[-1]; j-- > 0;) + for (j = stk[-1].nlastpos; j-- > 0;) { fprintf (stderr, " %zd:", lastpos[j].index); prtok (d->tokens[lastpos[j].index]); @@ -2591,25 +2565,18 @@ dfaanalyze (struct dfa *d, int searchflag) /* Get the epsilon closure of the firstpos of the regexp. The result will be the set of positions of state 0. */ merged.nelem = 0; - for (i = 0; i < nfirstpos[-1]; ++i) + for (i = 0; i < stk[-1].nfirstpos; ++i) insert (firstpos[i], &merged); epsclosure (&merged, d); /* Build the initial state. */ - d->salloc = 1; - d->sindex = 0; - MALLOC (d->states, d->salloc); - separate_contexts = state_separate_contexts (&merged); state_index (d, &merged, (separate_contexts & CTX_NEWLINE ? CTX_NEWLINE : separate_contexts ^ CTX_ANY)); - free (o_nullable); - free (o_nfirst); - free (o_firstpos); - free (o_nlast); - free (o_lastpos); + free (posalloc); + free (stkalloc); free (merged.elems); } @@ -2647,16 +2614,16 @@ dfaanalyze (struct dfa *d, int searchflag) void dfastate (state_num s, struct dfa *d, state_num trans[]) { - leaf_set *grps; /* As many as will ever be needed. */ - charclass *labels; /* Labels corresponding to the groups. */ + leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */ + charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ size_t ngrps = 0; /* Number of groups actually used. */ position pos; /* Current position being considered. */ charclass matches; /* Set of matching characters. */ - int matchesf; /* True if matches is nonempty. */ + unsigned int matchesf; /* Nonzero if matches is nonempty. */ charclass intersect; /* Intersection with some label set. */ - int intersectf; /* True if intersect is nonempty. */ + unsigned int intersectf; /* Nonzero if intersect is nonempty. */ charclass leftovers; /* Stuff in the label that didn't match. */ - int leftoversf; /* True if leftovers is nonempty. */ + unsigned int leftoversf; /* Nonzero if leftovers is nonempty. */ position_set follows; /* Union of the follows of some group. */ position_set tmp; /* Temporary space for merging sets. */ int possible_contexts; /* Contexts that this group can match. */ @@ -2664,12 +2631,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_num state; /* New state. */ state_num state_newline; /* New state on a newline transition. */ state_num state_letter; /* New state on a letter transition. */ - int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */ + bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */ size_t i, j, k; - MALLOC (grps, NOTCHAR); - MALLOC (labels, NOTCHAR); - zeroset (matches); for (i = 0; i < d->states[s].elems.nelem; ++i) @@ -2732,7 +2696,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) matches. */ intersectf = 0; for (k = 0; k < CHARCLASS_INTS; ++k) - (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; + intersectf |= intersect[k] = matches[k] & labels[j][k]; if (!intersectf) continue; @@ -2743,8 +2707,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* Even an optimizing compiler can't know this for sure. */ int match = matches[k], label = labels[j][k]; - (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; - (matches[k] = match & ~label) ? (matchesf = 1) : 0; + leftoversf |= leftovers[k] = ~match & label; + matchesf |= matches[k] = match & ~label; } /* If there were leftovers, create a new group labeled with them. */ @@ -2752,7 +2716,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) { copyset (leftovers, labels[ngrps]); copyset (intersect, labels[j]); - MALLOC (grps[ngrps].elems, d->nleaves); + grps[ngrps].elems = xnmalloc (d->nleaves, + sizeof *grps[ngrps].elems); memcpy (grps[ngrps].elems, grps[j].elems, sizeof (grps[j].elems[0]) * grps[j].nelem); grps[ngrps].nelem = grps[j].nelem; @@ -2775,7 +2740,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) { copyset (matches, labels[ngrps]); zeroset (matches); - MALLOC (grps[ngrps].elems, d->nleaves); + grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems); grps[ngrps].nelem = 1; grps[ngrps].elems[0] = pos.index; ++ngrps; @@ -2821,7 +2786,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) insert (d->follows[grps[i].elems[j]].elems[k], &follows); - if (d->mb_cur_max > 1) + if (d->multibyte) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes @@ -2841,12 +2806,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) codepoint of <sb a>, it must not be <sb a> but 2nd byte of <mb A>, so we cannot add state[0]. */ - next_isnt_1st_byte = 0; + next_isnt_1st_byte = false; for (j = 0; j < follows.nelem; ++j) { if (!(d->multibyte_prop[follows.elems[j].index] & 1)) { - next_isnt_1st_byte = 1; + next_isnt_1st_byte = true; break; } } @@ -2855,7 +2820,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ if (d->searchflag - && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte))) + && (!MBS_SUPPORT || (!d->multibyte || !next_isnt_1st_byte))) for (j = 0; j < d->states[0].elems.nelem; ++j) insert (d->states[0].elems.elems[j], &follows); @@ -2897,8 +2862,31 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) free (grps[i].elems); free (follows.elems); free (tmp.elems); - free (grps); - free (labels); +} + +/* Make sure D's state arrays are large enough to hold NEW_STATE. */ +static void +realloc_trans_if_necessary (struct dfa *d, state_num new_state) +{ + state_num oldalloc = d->tralloc; + if (oldalloc <= new_state) + { + state_num **realtrans = d->trans ? d->trans - 1 : NULL; + size_t newalloc, newalloc1; + newalloc1 = new_state + 1; + realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans); + realtrans[0] = NULL; + d->trans = realtrans + 1; + d->tralloc = newalloc = newalloc1 - 1; + d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); + d->success = xnrealloc (d->success, newalloc, sizeof *d->success); + d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); + for (; oldalloc < newalloc; oldalloc++) + { + d->trans[oldalloc] = NULL; + d->fails[oldalloc] = NULL; + } + } } /* Some routines for manipulating a compiled dfa's transition tables. @@ -2912,7 +2900,7 @@ static void build_state (state_num s, struct dfa *d) { state_num *trans; /* The new transition table. */ - state_num i; + state_num i, maxstate; /* Set an upper limit on the number of transition tables that will ever exist at once. 1024 is arbitrary. The idea is that the frequently @@ -2940,30 +2928,17 @@ build_state (state_num s, struct dfa *d) if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d)) d->success[s] |= CTX_NONE; - MALLOC (trans, NOTCHAR); + trans = xmalloc (NOTCHAR * sizeof *trans); dfastate (s, d, trans); /* Now go through the new transition table, and make sure that the trans and fail arrays are allocated large enough to hold a pointer for the largest state mentioned in the table. */ + maxstate = -1; for (i = 0; i < NOTCHAR; ++i) - if (trans[i] >= d->tralloc) - { - state_num oldalloc = d->tralloc; - - while (trans[i] >= d->tralloc) - d->tralloc *= 2; - REALLOC (d->realtrans, d->tralloc + 1); - d->trans = d->realtrans + 1; - REALLOC (d->fails, d->tralloc); - REALLOC (d->success, d->tralloc); - REALLOC (d->newlines, d->tralloc); - while (oldalloc < d->tralloc) - { - d->trans[oldalloc] = NULL; - d->fails[oldalloc++] = NULL; - } - } + if (maxstate < trans[i]) + maxstate = trans[i]; + realloc_trans_if_necessary (d, maxstate); /* Keep the newline transition in a special place so we can use it as a sentinel. */ @@ -2979,65 +2954,21 @@ build_state (state_num s, struct dfa *d) static void build_state_zero (struct dfa *d) { - d->tralloc = 1; + /* Initial size of the transition tables; must be positive. */ + int initial_tab_size = 1; + + d->tralloc = 0; d->trcount = 0; - CALLOC (d->realtrans, d->tralloc + 1); - d->trans = d->realtrans + 1; - CALLOC (d->fails, d->tralloc); - MALLOC (d->success, d->tralloc); - MALLOC (d->newlines, d->tralloc); + d->trans = NULL; + d->fails = NULL; + d->success = NULL; + d->newlines = NULL; + realloc_trans_if_necessary (d, initial_tab_size); build_state (0, d); } /* Multibyte character handling sub-routines for dfaexec. */ -/* The initial state may encounter a byte which is not a single byte character - nor the first byte of a multibyte character. But it is incorrect for the - initial state to accept such a byte. For example, in Shift JIS the regular - expression "\\" accepts the codepoint 0x5c, but should not accept the second - byte of the codepoint 0x815c. Then the initial state must skip the bytes - that are not a single byte character nor the first byte of a multibyte - character. */ -#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ - if (s == 0) \ - { \ - while (inputwcs[p - buf_begin] == 0 \ - && mblen_buf[p - buf_begin] > 0 \ - && (unsigned char const *) p < buf_end) \ - ++p; \ - if ((char *) p >= end) \ - { \ - free (mblen_buf); \ - free (inputwcs); \ - *end = saved_end; \ - return NULL; \ - } \ - } - -static void -realloc_trans_if_necessary (struct dfa *d, state_num new_state) -{ - /* Make sure that the trans and fail arrays are allocated large enough - to hold a pointer for the new state. */ - if (new_state >= d->tralloc) - { - state_num oldalloc = d->tralloc; - - while (new_state >= d->tralloc) - d->tralloc *= 2; - REALLOC (d->realtrans, d->tralloc + 1); - d->trans = d->realtrans + 1; - REALLOC (d->fails, d->tralloc); - REALLOC (d->success, d->tralloc); - REALLOC (d->newlines, d->tralloc); - while (oldalloc < d->tralloc) - { - d->trans[oldalloc] = NULL; - d->fails[oldalloc++] = NULL; - } - } -} - /* Return values of transit_state_singlebyte, and transit_state_consume_1char. */ typedef enum @@ -3070,14 +3001,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, works = 0; } else if (works < 0) - { - if (p == buf_end) - { - /* At the moment, it must not happen. */ - abort (); - } - works = 0; - } + works = 0; else if (d->fails[works]) { works = d->fails[works][*p]; @@ -3092,18 +3016,13 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, return rval; } -/* Match a "." against the current context. buf_begin[IDX] is the - current position. Return the length of the match, in bytes. - POS is the position of the ".". */ +/* Match a "." against the current context. Return the length of the + match, in bytes. POS is the position of the ".". */ static int -match_anychar (struct dfa *d, state_num s, position pos, size_t idx) +match_anychar (struct dfa *d, state_num s, position pos, + wchar_t wc, size_t mbclen) { int context; - wchar_t wc; - int mbclen; - - wc = inputwcs[idx]; - mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx]; /* Check syntax bits. */ if (wc == (wchar_t) eolbyte) @@ -3125,16 +3044,14 @@ match_anychar (struct dfa *d, state_num s, position pos, size_t idx) } /* Match a bracket expression against the current context. - buf_begin[IDX] is the current position. Return the length of the match, in bytes. POS is the position of the bracket expression. */ static int -match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) +match_mb_charset (struct dfa *d, state_num s, position pos, + char const *p, wchar_t wc, size_t match_len) { size_t i; - int match; /* Matching succeeded. */ - int match_len; /* Length of the character (or collating element) - with which this operator matches. */ + bool match; /* Matching succeeded. */ int op_len; /* Length of the operator. */ char buffer[128]; @@ -3142,9 +3059,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) struct mb_char_classes *work_mbc; int context; - wchar_t wc; /* Current referring character. */ - - wc = inputwcs[idx]; /* Check syntax bits. */ if (wc == (wchar_t) eolbyte) @@ -3165,7 +3079,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) /* Assign the current referring operator to work_mbc. */ work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); match = !work_mbc->invert; - match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx]; /* Match in range 0-255? */ if (wc < NOTCHAR && work_mbc->cset != -1 @@ -3179,14 +3092,14 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) goto charset_matched; } - strncpy (buffer, (char const *) buf_begin + idx, match_len); + strncpy (buffer, p, match_len); buffer[match_len] = '\0'; /* match with an equivalence class? */ for (i = 0; i < work_mbc->nequivs; i++) { op_len = strlen (work_mbc->equivs[i]); - strncpy (buffer, (char const *) buf_begin + idx, op_len); + strncpy (buffer, p, op_len); buffer[op_len] = '\0'; if (strcoll (work_mbc->equivs[i], buffer) == 0) { @@ -3199,7 +3112,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) for (i = 0; i < work_mbc->ncoll_elems; i++) { op_len = strlen (work_mbc->coll_elems[i]); - strncpy (buffer, (char const *) buf_begin + idx, op_len); + strncpy (buffer, p, op_len); buffer[op_len] = '\0'; if (strcoll (work_mbc->coll_elems[i], buffer) == 0) @@ -3212,7 +3125,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) /* match with a range? */ for (i = 0; i < work_mbc->nranges; i++) { - if (work_mbc->range_sts[i] <= wc && wc <= work_mbc->range_ends[i]) + if (work_mbc->ranges[i].beg <= wc && wc <= work_mbc->ranges[i].end) goto charset_matched; } @@ -3233,27 +3146,25 @@ charset_matched: array which corresponds to 'd->states[s].mbps.elem'; each element of the array contains the number of bytes with which the element can match. - 'idx' is the index from buf_begin, and it is the current position - in the buffer. - The caller MUST free the array which this function return. */ static int * -check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx) +check_matching_with_multibyte_ops (struct dfa *d, state_num s, + char const *p, wchar_t wc, size_t mbclen) { size_t i; int *rarray; - MALLOC (rarray, d->states[s].mbps.nelem); + rarray = d->mb_match_lens; for (i = 0; i < d->states[s].mbps.nelem; ++i) { position pos = d->states[s].mbps.elems[i]; switch (d->tokens[pos.index]) { case ANYCHAR: - rarray[i] = match_anychar (d, s, pos, idx); + rarray[i] = match_anychar (d, s, pos, wc, mbclen); break; case MBCSET: - rarray[i] = match_mb_charset (d, s, pos, idx); + rarray[i] = match_mb_charset (d, s, pos, p, wc, mbclen); break; default: break; /* cannot happen. */ @@ -3273,48 +3184,39 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx) static status_transit_state transit_state_consume_1char (struct dfa *d, state_num s, unsigned char const **pp, - int *match_lens, int *mbclen, position_set * pps) + wchar_t wc, size_t mbclen, + int *match_lens) { size_t i, j; int k; state_num s1, s2; - int *work_mbls; status_transit_state rs = TRANSIT_STATE_DONE; - /* Calculate the length of the (single/multi byte) character - to which p points. */ - *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin]; + if (! match_lens && d->states[s].mbps.nelem != 0) + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, + wc, mbclen); /* Calculate the state which can be reached from the state 's' by - consuming '*mbclen' single bytes from the buffer. */ + consuming 'mbclen' single bytes from the buffer. */ s1 = s; - for (k = 0; k < *mbclen; k++) + for (k = 0; k < mbclen; k++) { s2 = s1; rs = transit_state_singlebyte (d, s2, (*pp)++, &s1); } - /* Copy the positions contained by 's1' to the set 'pps'. */ - copy (&(d->states[s1].elems), pps); - - /* Check (input) match_lens, and initialize if it is NULL. */ - if (match_lens == NULL && d->states[s].mbps.nelem != 0) - work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin); - else - work_mbls = match_lens; + copy (&d->states[s1].elems, &d->mb_follows); /* Add all of the positions which can be reached from 's' by consuming a single character. */ for (i = 0; i < d->states[s].mbps.nelem; i++) { - if (work_mbls[i] == *mbclen) + if (match_lens[i] == mbclen) for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; j++) - insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps); + insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], + &d->mb_follows); } - if (match_lens == NULL && work_mbls != NULL) - free (work_mbls); - /* FIXME: this return value is always ignored. */ return rs; } @@ -3323,7 +3225,8 @@ transit_state_consume_1char (struct dfa *d, state_num s, buffer. This function is for some operator which can match with a multi- byte character or a collating element (which may be multi characters). */ static state_num -transit_state (struct dfa *d, state_num s, unsigned char const **pp) +transit_state (struct dfa *d, state_num s, unsigned char const **pp, + unsigned char const *end) { state_num s1; int mbclen; /* The length of current input multibyte character. */ @@ -3331,7 +3234,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) size_t i, j; int *match_lens = NULL; size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */ - position_set follows; unsigned char const *p1 = *pp; wchar_t wc; @@ -3340,7 +3242,9 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) We check whether each of them can match or not. */ { /* Note: caller must free the return value of this function. */ - match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin); + mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, + wc, mbclen); for (i = 0; i < nelem; i++) /* Search the operator which match the longest string, @@ -3362,26 +3266,25 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) if (rs == TRANSIT_STATE_DONE) ++*pp; - free (match_lens); return s1; } /* This state has some operators which can match a multibyte character. */ - alloc_position_set (&follows, d->nleaves); + d->mb_follows.nelem = 0; /* 'maxlen' may be longer than the length of a character, because it may not be a character but a (multi character) collating element. We enumerate all of the positions which 's' can reach by consuming 'maxlen' bytes. */ - transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows); + transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens); - wc = inputwcs[*pp - mbclen - buf_begin]; - s1 = state_index (d, &follows, wchar_context (wc)); + s1 = state_index (d, &d->mb_follows, wchar_context (wc)); realloc_trans_if_necessary (d, s1); while (*pp - p1 < maxlen) { - transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows); + mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); + transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL); for (i = 0; i < nelem; i++) { @@ -3389,51 +3292,15 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j], - &follows); + &d->mb_follows); } - wc = inputwcs[*pp - mbclen - buf_begin]; - s1 = state_index (d, &follows, wchar_context (wc)); + s1 = state_index (d, &d->mb_follows, wchar_context (wc)); realloc_trans_if_necessary (d, s1); } - free (match_lens); - free (follows.elems); return s1; } - -/* Initialize mblen_buf and inputwcs with data from the next line. */ - -static void -prepare_wc_buf (struct dfa *d, const char *begin, const char *end) -{ -#if MBS_SUPPORT - unsigned char eol = eolbyte; - size_t i; - size_t ilim = end - begin + 1; - - buf_begin = (unsigned char *) begin; - - for (i = 0; i < ilim; i++) - { - size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs); - mblen_buf[i] = nbytes - (nbytes == 1); - if (begin[i] == eol) - break; - while (--nbytes != 0) - { - i++; - mblen_buf[i] = nbytes; - inputwcs[i] = 0; - } - } - - buf_end = (unsigned char *) (begin + i); - mblen_buf[i] = 0; - inputwcs[i] = 0; /* sentinel */ -#endif /* MBS_SUPPORT */ -} - /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -3451,39 +3318,64 @@ dfaexec (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { state_num s, s1; /* Current state. */ - unsigned char const *p; /* Current input character. */ + unsigned char const *p, *mbp; /* Current input character. */ state_num **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ unsigned char eol = eolbyte; /* Likewise for eolbyte. */ unsigned char saved_end; + size_t nlcount = 0; if (!d->tralloc) build_state_zero (d); s = s1 = 0; - p = (unsigned char const *) begin; + p = mbp = (unsigned char const *) begin; trans = d->trans; saved_end = *(unsigned char *) end; *end = eol; - if (d->mb_cur_max > 1) + if (d->multibyte) { - MALLOC (mblen_buf, end - begin + 2); - MALLOC (inputwcs, end - begin + 2); - memset (&mbs, 0, sizeof (mbstate_t)); - prepare_wc_buf (d, (const char *) p, end); + memset (&d->mbs, 0, sizeof d->mbs); + if (! d->mb_match_lens) + { + d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens); + alloc_position_set (&d->mb_follows, d->nleaves); + } } for (;;) { - if (d->mb_cur_max > 1) + if (d->multibyte) { while ((t = trans[s]) != NULL) { - if (p > buf_end) - break; s1 = s; - SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p); + + if (s == 0) + { + /* The initial state may encounter a byte which is not + a single byte character nor the first byte of a + multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, + in Shift JIS the regular expression "\\" accepts + the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial + state must skip the bytes that are not a single + byte character nor the first byte of a multibyte + character. */ + wchar_t wc; + while (mbp < p) + mbp += mbs_to_wchar (&wc, (char const *) mbp, + end - (char const *) mbp, d); + p = mbp; + + if ((char *) p >= end) + { + p = NULL; + goto done; + } + } if (d->states[s].mbps.nelem == 0) { @@ -3495,18 +3387,16 @@ dfaexec (struct dfa *d, char const *begin, char *end, better performance (up to 25% better on [a-z], for example) and enables support for collating symbols and equivalence classes. */ - if (backref) + if (d->states[s].has_mbcset && backref) { *backref = 1; - free (mblen_buf); - free (inputwcs); - *end = saved_end; - return (char *) p; + goto done; } /* Can match with a multibyte character (and multi character collating element). Transition table might be updated. */ - s = transit_state (d, s, &p); + s = transit_state (d, s, &p, (unsigned char *) end); + mbp = p; trans = d->trans; } } @@ -3526,27 +3416,28 @@ dfaexec (struct dfa *d, char const *begin, char *end, } } - if (s >= 0 && (char *) p <= end && d->fails[s]) + if ((char *) p > end) + { + p = NULL; + goto done; + } + + if (s >= 0 && d->fails[s]) { if (d->success[s] & sbit[*p]) { if (backref) - *backref = (d->states[s].backref != 0); - if (d->mb_cur_max > 1) - { - free (mblen_buf); - free (inputwcs); - } - *end = saved_end; - return (char *) p; + *backref = d->states[s].has_backref; + goto done; } s1 = s; - if (d->mb_cur_max > 1) + if (d->multibyte) { /* Can match with a multibyte character (and multicharacter collating element). Transition table might be updated. */ - s = transit_state (d, s, &p); + s = transit_state (d, s, &p, (unsigned char *) end); + mbp = p; trans = d->trans; } else @@ -3554,31 +3445,18 @@ dfaexec (struct dfa *d, char const *begin, char *end, continue; } - /* If the previous character was a newline, count it. */ - if ((char *) p <= end && p[-1] == eol) + /* If the previous character was a newline, count it, and skip + checking of multibyte character boundary until here. */ + if (p[-1] == eol) { - if (count) - ++*count; - - if (d->mb_cur_max > 1) - prepare_wc_buf (d, (const char *) p, end); - } - - /* Check if we've run off the end of the buffer. */ - if ((char *) p > end) - { - if (d->mb_cur_max > 1) - { - free (mblen_buf); - free (inputwcs); - } - *end = saved_end; - return NULL; + nlcount++; + mbp = p; } if (s >= 0) { - build_state (s, d); + if (!d->trans[s]) + build_state (s, d); trans = d->trans; continue; } @@ -3591,6 +3469,32 @@ dfaexec (struct dfa *d, char const *begin, char *end, s = 0; } + + done: + if (count) + *count += nlcount; + *end = saved_end; + return (char *) p; +} + +/* Search through a buffer looking for a potential match for D. + Return the offset of the byte after the first potential match. + If there is no match, return (size_t) -1. If D lacks a superset + so it's not known whether there is a match, return (size_t) -2. + BEGIN points to the beginning of the buffer, and END points to the + first byte after its end. Store a sentinel byte (usually newline) + in *END, so the actual buffer must be one byte longer. If COUNT is + non-NULL, increment *COUNT once for each newline processed. */ +size_t +dfahint (struct dfa *d, char const *begin, char *end, size_t *count) +{ + if (! d->superset) + return -2; + else + { + char const *match = dfaexec (d->superset, begin, end, 1, count, NULL); + return match ? match - begin : -1; + } } static void @@ -3599,7 +3503,6 @@ free_mbdata (struct dfa *d) size_t i; free (d->multibyte_prop); - d->multibyte_prop = NULL; for (i = 0; i < d->nmbcsets; ++i) { @@ -3607,8 +3510,7 @@ free_mbdata (struct dfa *d) struct mb_char_classes *p = &(d->mbcsets[i]); free (p->chars); free (p->ch_classes); - free (p->range_sts); - free (p->range_ends); + free (p->ranges); for (j = 0; j < p->nequivs; ++j) free (p->equivs[j]); @@ -3620,8 +3522,9 @@ free_mbdata (struct dfa *d) } free (d->mbcsets); - d->mbcsets = NULL; - d->nmbcsets = 0; + free (d->mb_follows.elems); + free (d->mb_match_lens); + d->mb_match_lens = NULL; } /* Initialize the components of a dfa that the other routines don't @@ -3630,22 +3533,7 @@ void dfainit (struct dfa *d) { memset (d, 0, sizeof *d); - - d->calloc = 1; - MALLOC (d->charclasses, d->calloc); - - d->talloc = 1; - MALLOC (d->tokens, d->talloc); - - d->mb_cur_max = MB_CUR_MAX; - - if (d->mb_cur_max > 1) - { - d->nmultibyte_prop = 1; - MALLOC (d->multibyte_prop, d->nmultibyte_prop); - d->mbcsets_alloc = 1; - MALLOC (d->mbcsets, d->mbcsets_alloc); - } + d->multibyte = MB_CUR_MAX > 1; } static void @@ -3672,7 +3560,83 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->mb_cur_max = 1; + d->multibyte = false; +} + +static void +dfasuperset (struct dfa *d) +{ + size_t i, j; + charclass ccl; + bool have_achar = false; + bool have_nchar = false; + struct dfa *sup = dfaalloc (); + + *sup = *d; + sup->multibyte = false; + sup->multibyte_prop = NULL; + sup->mbcsets = NULL; + sup->superset = NULL; + sup->states = NULL; + sup->sindex = 0; + sup->follows = NULL; + sup->tralloc = 0; + sup->trans = NULL; + sup->fails = NULL; + sup->success = NULL; + sup->newlines = NULL; + sup->musts = NULL; + + sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); + memcpy (sup->charclasses, d->charclasses, + d->cindex * sizeof *sup->charclasses); + + sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); + sup->talloc = d->tindex * 2; + + for (i = j = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case ANYCHAR: + case MBCSET: + case BACKREF: + zeroset (ccl); + notset (ccl); + sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = STAR; + if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR + || d->tokens[i + 1] == PLUS) + i++; + have_achar = true; + break; + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (d->multibyte) + { + /* Ignore these constraints. */ + sup->tokens[j++] = EMPTY; + break; + } + default: + sup->tokens[j++] = d->tokens[i]; + if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR) + || d->tokens[i] >= CSET) + have_nchar = true; + break; + } + } + sup->tindex = j; + + if (have_nchar && (have_achar || d->multibyte)) + d->superset = sup; + else + { + dfafree (sup); + free (sup); + } } /* Parse and analyze a single string of the given length. */ @@ -3684,7 +3648,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfaparse (s, len, d); dfamust (d); dfaoptimize (d); + dfasuperset (d); dfaanalyze (d, searchflag); + if (d->superset) + dfaanalyze (d->superset, searchflag); } /* Free the storage held by the components of a dfa. */ @@ -3697,34 +3664,46 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->mb_cur_max > 1) + if (d->multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) { free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + free (d->states[i].mbps.elems); } free (d->states); - for (i = 0; i < d->tindex; ++i) - free (d->follows[i].elems); - free (d->follows); - for (i = 0; i < d->tralloc; ++i) + + if (d->follows) + { + for (i = 0; i < d->tindex; ++i) + free (d->follows[i].elems); + free (d->follows); + } + + if (d->trans) { - free (d->trans[i]); - free (d->fails[i]); + for (i = 0; i < d->tralloc; ++i) + { + free (d->trans[i]); + free (d->fails[i]); + } + + free (d->trans - 1); + free (d->fails); + free (d->newlines); + free (d->success); } - free (d->realtrans); - free (d->fails); - free (d->newlines); - free (d->success); + for (dm = d->musts; dm; dm = ndm) { ndm = dm->next; free (dm->must); free (dm); } + + if (d->superset) + dfafree (d->superset); } /* Having found the postfix representation of the regular expression, @@ -3815,21 +3794,16 @@ static char * icatalloc (char *old, char const *new) { char *result; - size_t oldsize = old == NULL ? 0 : strlen (old); - size_t newsize = new == NULL ? 0 : strlen (new); + size_t oldsize; + size_t newsize = strlen (new); if (newsize == 0) return old; + oldsize = strlen (old); result = xrealloc (old, oldsize + newsize + 1); memcpy (result + oldsize, new, newsize + 1); return result; } -static char * -icpyalloc (char const *string) -{ - return icatalloc (NULL, string); -} - static char *_GL_ATTRIBUTE_PURE istrstr (char const *lookin, char const *lookfor) { @@ -3846,29 +3820,15 @@ istrstr (char const *lookin, char const *lookfor) static void freelist (char **cpp) { - size_t i; - - if (cpp == NULL) - return; - for (i = 0; cpp[i] != NULL; ++i) - { - free (cpp[i]); - cpp[i] = NULL; - } + while (*cpp) + free (*cpp++); } static char ** enlist (char **cpp, char *new, size_t len) { size_t i, j; - - if (cpp == NULL) - return NULL; - if ((new = icpyalloc (new)) == NULL) - { - freelist (cpp); - return NULL; - } + new = memcpy (xmalloc (len + 1), new, len); new[len] = '\0'; /* Is there already something in the list that's new (or longer)? */ for (i = 0; cpp[i] != NULL; ++i) @@ -3891,53 +3851,35 @@ enlist (char **cpp, char *new, size_t len) cpp[i] = NULL; } /* Add the new string. */ - REALLOC (cpp, i + 2); + cpp = xnrealloc (cpp, i + 2, sizeof *cpp); cpp[i] = new; cpp[i + 1] = NULL; return cpp; } /* Given pointers to two strings, return a pointer to an allocated - list of their distinct common substrings. Return NULL if something - seems wild. */ + list of their distinct common substrings. */ static char ** comsubs (char *left, char const *right) { - char **cpp; + char **cpp = xzalloc (sizeof *cpp); char *lcp; - char *rcp; - size_t i, len; - - if (left == NULL || right == NULL) - return NULL; - cpp = malloc (sizeof *cpp); - if (cpp == NULL) - return NULL; - cpp[0] = NULL; + for (lcp = left; *lcp != '\0'; ++lcp) { - len = 0; - rcp = strchr (right, *lcp); + size_t len = 0; + char *rcp = strchr (right, *lcp); while (rcp != NULL) { + size_t i; for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) continue; if (i > len) len = i; rcp = strchr (rcp + 1, *lcp); } - if (len == 0) - continue; - { - char **p = enlist (cpp, lcp, len); - if (p == NULL) - { - freelist (cpp); - cpp = NULL; - break; - } - cpp = p; - } + if (len != 0) + cpp = enlist (cpp, lcp, len); } return cpp; } @@ -3945,16 +3887,8 @@ comsubs (char *left, char const *right) static char ** addlists (char **old, char **new) { - size_t i; - - if (old == NULL || new == NULL) - return NULL; - for (i = 0; new[i] != NULL; ++i) - { - old = enlist (old, new[i], strlen (new[i])); - if (old == NULL) - break; - } + for (; *new; new++) + old = enlist (old, *new, strlen (*new)); return old; } @@ -3963,125 +3897,127 @@ addlists (char **old, char **new) static char ** inboth (char **left, char **right) { - char **both; - char **temp; + char **both = xzalloc (sizeof *both); size_t lnum, rnum; - if (left == NULL || right == NULL) - return NULL; - both = malloc (sizeof *both); - if (both == NULL) - return NULL; - both[0] = NULL; for (lnum = 0; left[lnum] != NULL; ++lnum) { for (rnum = 0; right[rnum] != NULL; ++rnum) { - temp = comsubs (left[lnum], right[rnum]); - if (temp == NULL) - { - freelist (both); - return NULL; - } + char **temp = comsubs (left[lnum], right[rnum]); both = addlists (both, temp); freelist (temp); free (temp); - if (both == NULL) - return NULL; } } return both; } -typedef struct +typedef struct must must; + +struct must { char **in; char *left; char *right; char *is; -} must; + bool begline; + bool endline; + must *prev; +}; + +static must * +allocmust (must *mp) +{ + must *new_mp = xmalloc (sizeof *new_mp); + new_mp->in = xzalloc (sizeof *new_mp->in); + new_mp->left = xzalloc (2); + new_mp->right = xzalloc (2); + new_mp->is = xzalloc (2); + new_mp->begline = false; + new_mp->endline = false; + new_mp->prev = mp; + return new_mp; +} static void -resetmust (must * mp) +resetmust (must *mp) { + freelist (mp->in); + mp->in[0] = NULL; mp->left[0] = mp->right[0] = mp->is[0] = '\0'; + mp->begline = false; + mp->endline = false; +} + +static void +freemust (must *mp) +{ freelist (mp->in); + free (mp->in); + free (mp->left); + free (mp->right); + free (mp->is); + free (mp); } static void dfamust (struct dfa *d) { - must *musts; - must *mp; - char *result; + must *mp = NULL; + char const *result = ""; size_t ri; size_t i; - int exact; - token t; - static must must0; + bool exact = false; + bool begline = false; + bool endline = false; struct dfamust *dm; - static char empty_string[] = ""; - - result = empty_string; - exact = 0; - MALLOC (musts, d->tindex + 1); - mp = musts; - for (i = 0; i <= d->tindex; ++i) - mp[i] = must0; - for (i = 0; i <= d->tindex; ++i) - { - mp[i].in = xmalloc (sizeof *mp[i].in); - mp[i].left = xmalloc (2); - mp[i].right = xmalloc (2); - mp[i].is = xmalloc (2); - mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; - mp[i].in[0] = NULL; - } -#ifdef DEBUG - fprintf (stderr, "dfamust:\n"); - for (i = 0; i < d->tindex; ++i) - { - fprintf (stderr, " %zd:", i); - prtok (d->tokens[i]); - } - putc ('\n', stderr); -#endif + for (ri = 0; ri < d->tindex; ++ri) { - switch (t = d->tokens[ri]) + token t = d->tokens[ri]; + switch (t) { + case BEGLINE: + mp = allocmust (mp); + mp->begline = true; + break; + case ENDLINE: + mp = allocmust (mp); + mp->endline = true; + break; case LPAREN: case RPAREN: assert (!"neither LPAREN nor RPAREN may appear here"); + case EMPTY: - case BEGLINE: - case ENDLINE: case BEGWORD: case ENDWORD: case LIMWORD: case NOTLIMWORD: case BACKREF: - resetmust (mp); + case ANYCHAR: + case MBCSET: + mp = allocmust (mp); break; + case STAR: case QMARK: - assert (musts < mp); - --mp; resetmust (mp); break; + case OR: - assert (&musts[2] <= mp); { char **new; - must *lmp; - must *rmp; + must *rmp = mp; + must *lmp = mp = mp->prev; size_t j, ln, rn, n; - rmp = --mp; - lmp = --mp; /* Guaranteed to be. Unlikely, but ... */ if (!STREQ (lmp->is, rmp->is)) lmp->is[0] = '\0'; + lmp->begline &= rmp->begline; + lmp->endline &= rmp->endline; /* Left side--easy */ i = 0; while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) @@ -4100,133 +4036,126 @@ dfamust (struct dfa *d) lmp->right[j] = lmp->right[(ln - i) + j]; lmp->right[j] = '\0'; new = inboth (lmp->in, rmp->in); - if (new == NULL) - goto done; freelist (lmp->in); free (lmp->in); lmp->in = new; + freemust (rmp); } break; + case PLUS: - assert (musts < mp); - --mp; mp->is[0] = '\0'; break; + case END: - assert (mp == &musts[1]); - for (i = 0; musts[0].in[i] != NULL; ++i) - if (strlen (musts[0].in[i]) > strlen (result)) - result = musts[0].in[i]; - if (STREQ (result, musts[0].is)) - exact = 1; + assert (!mp->prev); + for (i = 0; mp->in[i] != NULL; ++i) + if (strlen (mp->in[i]) > strlen (result)) + result = mp->in[i]; + if (STREQ (result, mp->is)) + { + exact = true; + begline = mp->begline; + endline = mp->endline; + } goto done; + case CAT: - assert (&musts[2] <= mp); { - must *lmp; - must *rmp; + must *rmp = mp; + must *lmp = mp = mp->prev; - rmp = --mp; - lmp = --mp; /* In. Everything in left, plus everything in right, plus concatenation of left's right and right's left. */ lmp->in = addlists (lmp->in, rmp->in); - if (lmp->in == NULL) - goto done; if (lmp->right[0] != '\0' && rmp->left[0] != '\0') { - char *tp; - - tp = icpyalloc (lmp->right); - tp = icatalloc (tp, rmp->left); - lmp->in = enlist (lmp->in, tp, strlen (tp)); + size_t lrlen = strlen (lmp->right); + size_t rllen = strlen (rmp->left); + char *tp = xmalloc (lrlen + rllen); + memcpy (tp, lmp->right, lrlen); + memcpy (tp + lrlen, rmp->left, rllen); + lmp->in = enlist (lmp->in, tp, lrlen + rllen); free (tp); - if (lmp->in == NULL) - goto done; } /* Left-hand */ if (lmp->is[0] != '\0') - { - lmp->left = icatalloc (lmp->left, rmp->left); - if (lmp->left == NULL) - goto done; - } + lmp->left = icatalloc (lmp->left, rmp->left); /* Right-hand */ if (rmp->is[0] == '\0') lmp->right[0] = '\0'; lmp->right = icatalloc (lmp->right, rmp->right); - if (lmp->right == NULL) - goto done; /* Guaranteed to be */ - if (lmp->is[0] != '\0' && rmp->is[0] != '\0') + if ((lmp->is[0] != '\0' || lmp->begline) + && (rmp->is[0] != '\0' || rmp->endline)) { lmp->is = icatalloc (lmp->is, rmp->is); - if (lmp->is == NULL) - goto done; + lmp->endline = rmp->endline; } else - lmp->is[0] = '\0'; + { + lmp->is[0] = '\0'; + lmp->begline = false; + lmp->endline = false; + } + freemust (rmp); } break; + + case '\0': + /* Not on *my* shift. */ + goto done; + default: - if (t < END) - { - assert (!"oops! t >= END"); - } - else if (t == '\0') - { - /* not on *my* shift */ - goto done; - } - else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET) - { - /* easy enough */ - resetmust (mp); - } - else + mp = allocmust (mp); + if (CSET <= t) { - /* plain character */ - resetmust (mp); - mp->is[0] = mp->left[0] = mp->right[0] = t; - mp->is[1] = mp->left[1] = mp->right[1] = '\0'; - mp->in = enlist (mp->in, mp->is, (size_t) 1); - if (mp->in == NULL) - goto done; + /* If T is a singleton, or if case-folding in a unibyte + locale and T's members all case-fold to the same char, + convert T to one of its members. Otherwise, do + nothing further with T. */ + charclass *ccl = &d->charclasses[t - CSET]; + int j; + for (j = 0; j < NOTCHAR; j++) + if (tstbit (j, *ccl)) + break; + if (! (j < NOTCHAR)) + break; + t = j; + while (++j < NOTCHAR) + if (tstbit (j, *ccl) + && ! (case_fold && !d->multibyte + && toupper (j) == toupper (t))) + break; + if (j < NOTCHAR) + break; } + mp->is[0] = mp->left[0] = mp->right[0] + = case_fold && !d->multibyte ? toupper (t) : t; + mp->is[1] = mp->left[1] = mp->right[1] = '\0'; + mp->in = enlist (mp->in, mp->is, 1); break; } -#ifdef DEBUG - fprintf (stderr, " node: %zd:", ri); - prtok (d->tokens[ri]); - fprintf (stderr, "\n in:"); - for (i = 0; mp->in[i]; ++i) - fprintf (stderr, " \"%s\"", mp->in[i]); - fprintf (stderr, "\n is: \"%s\"\n", mp->is); - fprintf (stderr, " left: \"%s\"\n", mp->left); - fprintf (stderr, " right: \"%s\"\n", mp->right); -#endif - ++mp; } done: - if (strlen (result)) + if (*result) { - MALLOC (dm, 1); + dm = xmalloc (sizeof *dm); dm->exact = exact; - dm->must = xmemdup (result, strlen (result) + 1); + dm->begline = begline; + dm->endline = endline; + dm->must = xstrdup (result); dm->next = d->musts; d->musts = dm; } - mp = musts; - for (i = 0; i <= d->tindex; ++i) + + while (mp) { - freelist (mp[i].in); - free (mp[i].in); - free (mp[i].left); - free (mp[i].right); - free (mp[i].is); + must *prev = mp->prev; + freemust (mp); + mp = prev; } - free (mp); } struct dfa * @@ -19,13 +19,20 @@ /* Written June, 1988 by Mike Haertel */ #include <regex.h> +#ifdef HAVE_STDBOOL_H +#include <stdbool.h> +#else +#include "missing_d/gawkbool.h" +#endif /* HAVE_STDBOOL_H */ #include <stddef.h> /* Element of a list of strings, at least one of which is known to appear in any R.E. matching the DFA. */ struct dfamust { - int exact; + bool exact; + bool begline; + bool endline; char *must; struct dfamust *next; }; @@ -68,6 +75,17 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); extern char *dfaexec (struct dfa *d, char const *begin, char *end, int newline, size_t *count, int *backref); +/* Search through a buffer looking for a potential match for D. + Return the offset of the byte after the first potential match. + If there is no match, return (size_t) -1. If D lacks a superset + so it's not known whether there is a match, return (size_t) -2. + BEGIN points to the beginning of the buffer, and END points to the + first byte after its end. Store a sentinel byte (usually newline) + in *END, so the actual buffer must be one byte longer. If COUNT is + non-NULL, increment *COUNT once for each newline processed. */ +extern size_t dfahint (struct dfa *d, char const *begin, char *end, + size_t *count); + /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); @@ -101,11 +119,3 @@ extern void dfawarn (const char *); extern _Noreturn void dfaerror (const char *); extern int using_utf8 (void); - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER; see dfa.c. */ -enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; - -extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); |