diff options
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | dfa.c | 103 | ||||
-rw-r--r-- | extension/ChangeLog | 6 | ||||
-rw-r--r-- | extension/revoutput.3am | 4 | ||||
-rw-r--r-- | extension/revoutput.c | 15 |
5 files changed, 84 insertions, 48 deletions
@@ -1,3 +1,7 @@ +2015-08-02 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.c: Sync with GNU grep. Yet again. + 2015-07-21 Arnold D. Robbins <arnold@skeeve.com> * dfa.c: Sync with GNU grep. @@ -309,8 +309,6 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool has_backref; /* This state matches a \<digit>. */ - bool has_mbcset; /* This state matches a MBCSET. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -2195,8 +2193,6 @@ state_index (struct dfa *d, position_set const *s, int context) alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].has_backref = false; - d->states[i].has_mbcset = false; d->states[i].constraint = 0; d->states[i].first_end = 0; d->states[i].mbps.nelem = 0; @@ -2212,10 +2208,7 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].first_end = d->tokens[s->elems[j].index]; } else if (d->tokens[s->elems[j].index] == BACKREF) - { - d->states[i].constraint = NO_CONSTRAINT; - d->states[i].has_backref = true; - } + d->states[i].constraint = NO_CONSTRAINT; ++d->sindex; @@ -2674,9 +2667,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (d->tokens[pos.index] == MBCSET || d->tokens[pos.index] == ANYCHAR) { - /* MB_CUR_MAX > 1 */ - if (d->tokens[pos.index] == MBCSET) - d->states[s].has_mbcset = true; /* ANYCHAR and MBCSET must match with a single character, so we must put it to d->states[s].mbps, which contains the positions which can match with a single character not a byte. */ @@ -3388,15 +3378,18 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, When ALLOW_NL is nonzero, newlines may appear in the matching string. If COUNT is non-NULL, increment *COUNT once for each newline processed. Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we - encountered a back-reference (1) or not (0). The caller may use this - to decide whether to fall back on a backtracking matcher. - - If MULTIBYTE, the input consists of multibyte characters and/or - encoding-error bytes. Otherwise, the input consists of single-byte - characters. */ + encountered a DFA-unfriendly construct. The caller may use this to + decide whether to fall back on a matcher like regex. If MULTIBYTE, + the input consists of multibyte characters and/or encoding-error bytes. + Otherwise, the input consists of single-byte characters. + Here is the list of features that make this DFA matcher punt: + - [M-N]-range-in-MB-locale: regex is up to 25% faster on [a-z] + - back-reference: (.)\1 + - word-delimiter-in-MB-locale: \<, \>, \b + */ static inline char * -dfaexec_main (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref, bool multibyte) +dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, + size_t *count, bool multibyte) { state_num s, s1; /* Current state. */ unsigned char const *p, *mbp; /* Current input character. */ @@ -3486,16 +3479,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, Use a macro to avoid the risk that they diverge. */ #define State_transition() \ do { \ - /* Falling back to the glibc matcher in this case gives \ - better performance (up to 25% better on [a-z], for \ - example) and enables support for collating symbols and \ - equivalence classes. */ \ - if (d->states[s].has_mbcset && backref) \ - { \ - *backref = 1; \ - goto done; \ - } \ - \ /* Can match with a multibyte character (and multi-character \ collating element). Transition table might be updated. */ \ s = transit_state (d, s, &p, (unsigned char *) end); \ @@ -3569,11 +3552,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, if (d->fails[s]) { if (d->success[s] & sbit[*p]) - { - if (backref) - *backref = d->states[s].has_backref; - goto done; - } + goto done; s1 = s; if (multibyte) @@ -3603,14 +3582,24 @@ static char * dfaexec_mb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, true); + return dfaexec_main (d, begin, end, allow_nl, count, true); } static char * dfaexec_sb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, false); + return dfaexec_main (d, begin, end, allow_nl, count, false); +} + +/* Always set *BACKREF and return BEGIN. Use this wrapper for + any regexp that uses a construct not supported by this code. */ +static char * +dfaexec_noop (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + *backref = 1; + return (char *) begin; } /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte), @@ -3676,6 +3665,31 @@ dfainit (struct dfa *d) d->fast = !d->multibyte; } +/* Return true if every construct in D is supported by this DFA matcher. */ +static bool _GL_ATTRIBUTE_PURE +dfa_supported (struct dfa const *d) +{ + size_t i; + for (i = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (!d->multibyte) + continue; + /* fallthrough */ + + case BACKREF: + case MBCSET: + return false; + } + } + return true; +} + static void dfaoptimize (struct dfa *d) { @@ -3773,10 +3787,8 @@ dfassbuild (struct dfa *d) if (d->multibyte) { /* These constraints aren't supported in a multibyte locale. - Ignore them in the superset DFA, and treat them as - backreferences in the main DFA. */ + Ignore them in the superset DFA. */ sup->tokens[j++] = EMPTY; - d->tokens[i] = BACKREF; break; } default: @@ -3806,8 +3818,17 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfambcache (d); dfaparse (s, len, d); dfassbuild (d); - dfaoptimize (d); - dfaanalyze (d, searchflag); + + if (dfa_supported (d)) + { + dfaoptimize (d); + dfaanalyze (d, searchflag); + } + else + { + d->dfaexec = dfaexec_noop; + } + if (d->superset) { d->fast = true; diff --git a/extension/ChangeLog b/extension/ChangeLog index bbdd57fb..3d38072a 100644 --- a/extension/ChangeLog +++ b/extension/ChangeLog @@ -1,3 +1,9 @@ +2015-08-02 Arnold D. Robbins <arnold@skeeve.com> + + * revoutput.c (init_revoutput): Don't install REVOUT if it's + there already. Makes the extension usable with -v. + * revoutput.3am: Add a BUGS section. + 2015-06-17 Andrew J. Schorr <aschorr@telemetry-investments.com> * inplace.3am (BUGS): Document that ACLs are not preserved, and diff --git a/extension/revoutput.3am b/extension/revoutput.3am index 9c8f062f..8620935b 100644 --- a/extension/revoutput.3am +++ b/extension/revoutput.3am @@ -1,4 +1,4 @@ -.TH REVOUTPUT 3am "Jan 15 2013" "Free Software Foundation" "GNU Awk Extension Modules" +.TH REVOUTPUT 3am "Aug 02 2015" "Free Software Foundation" "GNU Awk Extension Modules" .SH NAME revoutput \- Reverse output strings sample extension .SH SYNOPSIS @@ -35,6 +35,8 @@ The output from this program is: dlrow ,olleh .fi .ft R +.SH BUGS +This extension does not affect the default standard output. .SH "SEE ALSO" .IR "GAWK: Effective AWK Programming" , .IR filefuncs (3am), diff --git a/extension/revoutput.c b/extension/revoutput.c index ae4b444a..69257167 100644 --- a/extension/revoutput.c +++ b/extension/revoutput.c @@ -7,7 +7,7 @@ */ /* - * Copyright (C) 2012, 2013 the Free Software Foundation, Inc. + * Copyright (C) 2012, 2013, 2015 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Programming Language. @@ -47,7 +47,7 @@ static const gawk_api_t *api; /* for convenience macros to work */ static awk_ext_id_t *ext_id; -static const char *ext_version = "revoutput extension: version 1.0"; +static const char *ext_version = "revoutput extension: version 1.1"; static awk_bool_t init_revoutput(void); static awk_bool_t (*init_func)(void) = init_revoutput; @@ -120,11 +120,14 @@ init_revoutput() register_output_wrapper(& output_wrapper); - make_number(0.0, & value); /* init to false */ - if (! sym_update("REVOUT", & value)) { - warning(ext_id, _("revoutput: could not initialize REVOUT variable")); + if (! sym_lookup("REVOUT", AWK_SCALAR, & value)) { + /* only install it if not there, e.g. -v REVOUT=1 */ + make_number(0.0, & value); /* init to false */ + if (! sym_update("REVOUT", & value)) { + warning(ext_id, _("revoutput: could not initialize REVOUT variable")); - return awk_false; + return awk_false; + } } return awk_true; |