diff options
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | dfa.c | 66 | ||||
-rw-r--r-- | dfa.h | 30 | ||||
-rw-r--r-- | re.c | 2 |
4 files changed, 54 insertions, 49 deletions
@@ -1,5 +1,10 @@ 2016-05-02 Arnold D. Robbins <arnold@skeeve.com> + * dfa.h, dfa.c: Sync with grep. + * re.c (research): Adjust type of try_backref. + +2016-05-02 Arnold D. Robbins <arnold@skeeve.com> + * awk.h (success_node): Declare. * array.c (success_node): Define. * cint_array.c, int_array.c, str_array.c: Use `& success_node' @@ -362,7 +362,8 @@ struct dfa mbstate_t mbs; /* Multibyte conversion state. */ /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, int, size_t *, int *); + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); /* The following are valid only if MB_CUR_MAX > 1. */ @@ -675,7 +676,8 @@ charclass_index (charclass const s) } /* Syntax bits controlling the behavior of the lexical analyzer. */ -static reg_syntax_t syntax_bits, syntax_bits_set; +static reg_syntax_t syntax_bits; +static bool syntax_bits_set; /* Flag for case-folding letters into sets. */ static bool case_fold; @@ -686,6 +688,10 @@ static unsigned char eolbyte; /* Cache of char-context values. */ static int sbit[NOTCHAR]; +/* If never_trail[B], the byte B cannot be a non-initial byte in a + multibyte character. */ +static bool never_trail[NOTCHAR]; + /* Set of characters considered letters. */ static charclass letters; @@ -720,12 +726,12 @@ wchar_context (wint_t wc) /* Entry point to set syntax options. */ void -dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) +dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) { int i; - syntax_bits_set = 1; + syntax_bits_set = true; syntax_bits = bits; - case_fold = fold != 0; + case_fold = fold; eolbyte = eol; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) @@ -747,6 +753,11 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) setbit (uc, newline); break; } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); } } @@ -782,7 +793,7 @@ setbit_case_fold_c (int b, charclass c) /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ -int +bool using_utf8 (void) { static int utf8 = -1; @@ -887,7 +898,7 @@ static wint_t wctok; /* Wide character representation of the current lexptr += nbytes; \ lexleft -= nbytes; \ } \ - } while (0) + } while (false) #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) @@ -1240,7 +1251,7 @@ parse_bracket_exp (void) lexptr = lexptr_saved; \ lexleft = lexleft_saved; \ } \ - while (0) + while (false) static token lex (void) @@ -1942,7 +1953,7 @@ regexp (void) /* Main entry point for the parser. S is a string to be parsed, len is the length of the string, so s can include NUL characters. D is a pointer to the struct dfa to parse into. */ -void +static void dfaparse (char const *s, size_t len, struct dfa *d) { dfa = d; @@ -2321,8 +2332,8 @@ state_separate_contexts (position_set const *s) Sets are stored as arrays of the elements, obeying a stack-like allocation scheme; the number of elements in each set deeper in the stack can be used to determine the address of a particular set's array. */ -void -dfaanalyze (struct dfa *d, int searchflag) +static void +dfaanalyze (struct dfa *d, bool searchflag) { /* Array allocated to hold position sets. */ position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc); @@ -2358,7 +2369,7 @@ dfaanalyze (struct dfa *d, int searchflag) putc ('\n', stderr); #endif - d->searchflag = searchflag != 0; + d->searchflag = searchflag; alloc_position_set (&merged, d->nleaves); d->follows = xcalloc (d->tindex, sizeof *d->follows); @@ -2567,7 +2578,7 @@ dfaanalyze (struct dfa *d, int searchflag) If after comparing with every group there are characters remaining in C, create a new group labeled with the characters of C and insert this position in that group. */ -void +static void dfastate (state_num s, struct dfa *d, state_num trans[]) { leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */ @@ -3198,15 +3209,20 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, that are not a single byte character nor the first byte of a multibyte character. - Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or - exceeds P. If WCP is non-NULL, set *WCP to the final wide character - processed, or if no wide character is processed, set it to WEOF. + Given DFA state d, use mbs_to_wchar to advance MBP until it reaches + or exceeds P, and return the advanced MBP. If WCP is non-NULL and + the result is greater than P, set *WCP to the final wide character + processed, or to WEOF if no wide character is processed. Otherwise, + if WCP is non-NULL, *WCP may or may not be updated. + Both P and MBP must be no larger than END. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, unsigned char const *mbp, char const *end, wint_t *wcp) { wint_t wc = WEOF; + if (never_trail[*p]) + return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); @@ -3235,7 +3251,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, - word-delimiter-in-MB-locale: \<, \>, \b */ static inline char * -dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, +dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, size_t *count, bool multibyte) { state_num s, s1; /* Current state. */ @@ -3351,7 +3367,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, \ mbp = p; \ trans = d->trans; \ - } while (0) + } while (false) State_transition(); } @@ -3427,14 +3443,14 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, static char * dfaexec_mb (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref) + bool allow_nl, size_t *count, bool *backref) { return dfaexec_main (d, begin, end, allow_nl, count, true); } static char * dfaexec_sb (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref) + bool allow_nl, size_t *count, bool *backref) { return dfaexec_main (d, begin, end, allow_nl, count, false); } @@ -3443,9 +3459,9 @@ dfaexec_sb (struct dfa *d, char const *begin, char *end, any regexp that uses a construct not supported by this code. */ static char * dfaexec_noop (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref) + bool allow_nl, size_t *count, bool *backref) { - *backref = 1; + *backref = true; return (char *) begin; } @@ -3454,7 +3470,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, char * dfaexec (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref) + bool allow_nl, size_t *count, bool *backref) { return d->dfaexec (d, begin, end, allow_nl, count, backref); } @@ -3492,7 +3508,7 @@ free_mbdata (struct dfa *d) /* Initialize the components of a dfa that the other routines don't initialize for themselves. */ -void +static void dfainit (struct dfa *d) { memset (d, 0, sizeof *d); @@ -3648,7 +3664,7 @@ dfassbuild (struct dfa *d) /* Parse and analyze a single string of the given length. */ void -dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) +dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag) { dfainit (d); dfaparse (s, len, d); @@ -57,12 +57,12 @@ extern void dfamustfree (struct dfamust *); /* dfasyntax() takes three arguments; the first sets the syntax bits described earlier in this file, the second sets the case-folding flag, and the third specifies the line terminator. */ -extern void dfasyntax (reg_syntax_t, int, unsigned char); +extern void dfasyntax (reg_syntax_t, bool, unsigned char); /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an exact matcher. */ -extern void dfacomp (char const *, size_t, struct dfa *, int); +extern void dfacomp (char const *, size_t, struct dfa *, bool); /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the @@ -71,13 +71,13 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); points to the beginning of the buffer, and END points to the first byte after its end. Note however that we store a sentinel byte (usually newline) in *END, so the actual buffer must be one byte longer. - When NEWLINE is nonzero, newlines may appear in the matching string. + When ALLOW_NL is true, newlines may appear in the matching string. If COUNT is non-NULL, increment *COUNT once for each newline processed. Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we - encountered a back-reference (1) or not (0). The caller may use this - to decide whether to fall back on a backtracking matcher. */ + encountered a back-reference. The caller can use this to decide + whether to fall back on a backtracking matcher. */ extern char *dfaexec (struct dfa *d, char const *begin, char *end, - int newline, size_t *count, int *backref); + bool allow_nl, size_t *count, bool *backref); /* Return a superset for D. The superset matches everything that D matches, along with some other strings (though the latter should be @@ -91,22 +91,6 @@ extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE; /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); -/* Entry points for people who know what they're doing. */ - -/* Initialize the components of a struct dfa. */ -extern void dfainit (struct dfa *); - -/* Incrementally parse a string of given length into a struct dfa. */ -extern void dfaparse (char const *, size_t, struct dfa *); - -/* Analyze a parsed regexp; second argument tells whether to build a searching - or an exact matcher. */ -extern void dfaanalyze (struct dfa *, int); - -/* Compute, for each possible character, the transitions out of a given - state, storing them in an array of integers. */ -extern void dfastate (ptrdiff_t, struct dfa *, ptrdiff_t []); - /* Error handling. */ /* dfawarn() is called by the regexp routines whenever a regex is compiled @@ -120,4 +104,4 @@ extern void dfawarn (const char *); The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); -extern int using_utf8 (void); +extern bool using_utf8 (void); @@ -254,7 +254,7 @@ research(Regexp *rp, char *str, int start, size_t len, int flags) { const char *ret = str; - int try_backref = false; + bool try_backref = false; int need_start; int no_bol; int res; |