diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2018-09-21 12:44:21 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2018-09-21 12:44:21 +0300 |
commit | f6a89a9b3ecd956e97b61719ea0d634886ace814 (patch) | |
tree | b1665dbdfe4df1e6517e08ee4da979acda903224 | |
parent | 53175840c18d19259ec11128bdda64cad71ac8dc (diff) | |
download | egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.gz egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.bz2 egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.zip |
Sync files from GNULIB.
-rw-r--r-- | missing_d/ChangeLog | 5 | ||||
-rw-r--r-- | missing_d/intprops.h | 4 | ||||
-rw-r--r-- | missing_d/mktime.c | 6 | ||||
-rw-r--r-- | missing_d/verify.h | 3 | ||||
-rw-r--r-- | support/ChangeLog | 4 | ||||
-rw-r--r-- | support/dfa.c | 295 | ||||
-rw-r--r-- | support/intprops.h | 4 |
7 files changed, 239 insertions, 82 deletions
diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog index 2990bd3f..6752ddcb 100644 --- a/missing_d/ChangeLog +++ b/missing_d/ChangeLog @@ -1,3 +1,8 @@ +2018-09-21 Arnold D. Robbins <arnold@skeeve.com> + + * intprops.h, mktime.c, verify.h: Updated from GNULIB. + * mktime-internal.h: New file, imported from GNULIB. + 2018-09-07 Arnold D. Robbins <arnold@skeeve.com> * intprops.h, mktime.c: Updated from GNULIB. diff --git a/missing_d/intprops.h b/missing_d/intprops.h index a4be30b8..9702aec4 100644 --- a/missing_d/intprops.h +++ b/missing_d/intprops.h @@ -342,8 +342,8 @@ Arguments should be free of side effects. */ #define _GL_BINARY_OP_OVERFLOW(a, b, op_result_overflow) \ op_result_overflow (a, b, \ - _GL_INT_MINIMUM ((1 ? 0 : (b)) + (a)), \ - _GL_INT_MAXIMUM ((1 ? 0 : (b)) + (a))) + _GL_INT_MINIMUM (_GL_INT_CONVERT (a, b)), \ + _GL_INT_MAXIMUM (_GL_INT_CONVERT (a, b))) /* Store the low-order bits of A + B, A - B, A * B, respectively, into *R. Return 1 if the result overflows. See above for restrictions. */ diff --git a/missing_d/mktime.c b/missing_d/mktime.c index 1a332e14..3ecab0d1 100644 --- a/missing_d/mktime.c +++ b/missing_d/mktime.c @@ -76,11 +76,7 @@ # define NEED_MKTIME_WORKING DEBUG_MKTIME #endif -#ifdef _LIBC -typedef long int mktime_offset_t; -#else # include "mktime-internal.h" -#endif #ifndef _LIBC static void @@ -531,7 +527,7 @@ mktime (struct tm *tp) be set as if the tzset() function had been called. */ __tzset (); -# if defined __LIBC || NEED_MKTIME_WORKING +# if defined _LIBC || NEED_MKTIME_WORKING static mktime_offset_t localtime_offset; return __mktime_internal (tp, __localtime_r, &localtime_offset); # else diff --git a/missing_d/verify.h b/missing_d/verify.h index bc7f99db..3b57ddee 100644 --- a/missing_d/verify.h +++ b/missing_d/verify.h @@ -276,7 +276,8 @@ template <int w> when 'assume' silences warnings even with older GCCs. */ # define assume(R) ((R) ? (void) 0 : __builtin_trap ()) #else -# define assume(R) ((void) (0 && (R))) + /* Some tools grok NOTREACHED, e.g., Oracle Studio 12.6. */ +# define assume(R) ((R) ? (void) 0 : /*NOTREACHED*/ (void) 0) #endif /* @assert.h omit end@ */ diff --git a/support/ChangeLog b/support/ChangeLog index 86e24b36..d426f53e 100644 --- a/support/ChangeLog +++ b/support/ChangeLog @@ -1,3 +1,7 @@ +2018-09-21 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.c, intprops.h: Sync from GNULIB. + 2018-09-16 Arnold D. Robbins <arnold@skeeve.com> * Makefile.in: Regenerated, using Automake 1.16.1. diff --git a/support/dfa.c b/support/dfa.c index 41df67bf..86eb677e 100644 --- a/support/dfa.c +++ b/support/dfa.c @@ -257,43 +257,19 @@ enum end of input; any value of END or less in the parse tree is such a symbol. Accepting states of the DFA are those that would have - a transition on END. */ + a transition on END. This is -1, not some + more-negative value, to tweak the speed of + comparisons to END. */ /* Ordinary character values are terminal symbols that match themselves. */ + /* CSET must come last in the following list of special tokens. Otherwise, + the list order matters only for performance. Related special tokens + should have nearby values so that code like (t == ANYCHAR || t == MBCSET + || CSET <= t) can be done with a single machine-level comparison. */ EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches the empty string. */ - BACKREF, /* BACKREF is generated by \<digit> - or by any other construct that - is not completely handled. If the scanner - detects a transition on backref, it returns - a kind of "semi-success" indicating that - the match will have to be verified with - a backtracking matcher. */ - - BEGLINE, /* BEGLINE is a terminal symbol that matches - the empty string at the beginning of a - line. */ - - ENDLINE, /* ENDLINE is a terminal symbol that matches - the empty string at the end of a line. */ - - BEGWORD, /* BEGWORD is a terminal symbol that matches - the empty string at the beginning of a - word. */ - - ENDWORD, /* ENDWORD is a terminal symbol that matches - the empty string at the end of a word. */ - - LIMWORD, /* LIMWORD is a terminal symbol that matches - the empty string at the beginning or the - end of a word. */ - - NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that - matches the empty string not at - the beginning or end of a word. */ - QMARK, /* QMARK is an operator of one argument that matches zero or one occurrences of its argument. */ @@ -323,15 +299,47 @@ enum RPAREN, /* RPAREN never appears in the parse tree. */ + WCHAR, /* Only returned by lex. wctok contains + the wide character representation. */ ANYCHAR, /* ANYCHAR is a terminal symbol that matches a valid multibyte (or single byte) character. It is used only if MB_CUR_MAX > 1. */ + BEG, /* BEG is an initial symbol that matches the + beginning of input. */ + + BEGLINE, /* BEGLINE is a terminal symbol that matches + the empty string at the beginning of a + line. */ + + ENDLINE, /* ENDLINE is a terminal symbol that matches + the empty string at the end of a line. */ + + BEGWORD, /* BEGWORD is a terminal symbol that matches + the empty string at the beginning of a + word. */ + + ENDWORD, /* ENDWORD is a terminal symbol that matches + the empty string at the end of a word. */ + + LIMWORD, /* LIMWORD is a terminal symbol that matches + the empty string at the beginning or the + end of a word. */ + + NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that + matches the empty string not at + the beginning or end of a word. */ + + BACKREF, /* BACKREF is generated by \<digit> + or by any other construct that + is not completely handled. If the scanner + detects a transition on backref, it returns + a kind of "semi-success" indicating that + the match will have to be verified with + a backtracking matcher. */ MBCSET, /* MBCSET is similar to CSET, but for multibyte characters. */ - WCHAR, /* Only returned by lex. wctok contains - the wide character representation. */ CSET /* CSET and (and any value greater) is a terminal symbol that matches any of a @@ -660,9 +668,9 @@ mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) static void prtok (token t) { - if (t < 0) + if (t <= END) fprintf (stderr, "END"); - else if (t < NOTCHAR) + else if (0 <= t && t < NOTCHAR) { unsigned int ch = t; fprintf (stderr, "0x%02x", ch); @@ -672,6 +680,9 @@ prtok (token t) char const *s; switch (t) { + case BEG: + s = "BEG"; + break; case EMPTY: s = "EMPTY"; break; @@ -1844,7 +1855,30 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parse.tok == WCHAR) + if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET + || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD + || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET) + { + if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) + { + /* For UTF-8 expand the period to a series of CSETs that define a + valid UTF-8 character. This avoids using the slow multibyte + path. I'm pretty sure it would be both profitable and correct to + do it for any encoding; however, the optimization must be done + manually as it is done above in add_utf8_anychar. So, let's + start with UTF-8: it is the most used, and the structure of the + encoding makes the correctness more obvious. */ + add_utf8_anychar (dfa); + } + else + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); + } + else if (dfa->parse.tok == WCHAR) { if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); @@ -1867,28 +1901,6 @@ atom (struct dfa *dfa) dfa->parse.tok = lex (dfa); } - else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) - { - /* For UTF-8 expand the period to a series of CSETs that define a valid - UTF-8 character. This avoids using the slow multibyte path. I'm - pretty sure it would be both profitable and correct to do it for - any encoding; however, the optimization must be done manually as - it is done above in add_utf8_anychar. So, let's start with - UTF-8: it is the most used, and the structure of the encoding - makes the correctness more obvious. */ - add_utf8_anychar (dfa); - dfa->parse.tok = lex (dfa); - } - else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) - || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF - || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE - || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR - || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD - || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) - { - addtok (dfa, dfa->parse.tok); - dfa->parse.tok = lex (dfa); - } else if (dfa->parse.tok == LPAREN) { dfa->parse.tok = lex (dfa); @@ -2014,6 +2026,9 @@ dfaparse (char const *s, size_t len, struct dfa *d) if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); + if (!d->nregexps) + addtok (d, BEG); + d->parse.tok = lex (d); d->parse.depth = d->depth; @@ -2132,6 +2147,21 @@ merge (position_set const *s1, position_set const *s2, position_set *m) merge_constrained (s1, s2, -1, m); } +static void +merge2 (position_set *dst, position_set const *src, position_set *m) +{ + if (src->nelem < 4) + { + for (ptrdiff_t i = 0; i < src->nelem; ++i) + insert (src->elems[i], dst); + } + else + { + merge (src, dst, m); + copy (m, dst); + } +} + /* Delete a position from a set. Return the nonzero constraint of the deleted position, or zero if there was no such position. */ static unsigned int @@ -2227,7 +2257,7 @@ state_index (struct dfa *d, position_set const *s, int context) for (state_num j = 0; j < s->nelem; j++) { int c = s->elems[j].constraint; - if (d->tokens[s->elems[j].index] < 0) + if (d->tokens[s->elems[j].index] <= END) { if (succeeds_in_context (c, context, CTX_ANY)) constraint |= c; @@ -2352,6 +2382,126 @@ state_separate_contexts (position_set const *s) return separate_contexts; } +enum +{ + /* Single token is repeated. It is distinguished from non-repeated. */ + OPT_REPEAT = (1 << 0), + + /* Multiple tokens are repeated. This flag is on at head of tokens. The + node is not merged. */ + OPT_LPAREN = (1 << 1), + + /* Multiple branches are joined. The node is not merged. */ + OPT_RPAREN = (1 << 2), + + /* The node is walked. If the node is found in walking again, OPT_RPAREN + flag is turned on. */ + OPT_WALKED = (1 << 3), + + /* The node is queued. The node is not queued again. */ + OPT_QUEUED = (1 << 4) +}; + +static ptrdiff_t +merge_nfa_state (struct dfa *d, size_t *queue, ptrdiff_t nqueue, size_t tindex, + char *flags, position_set *merged) +{ + position_set *follows = d->follows; + ptrdiff_t nelem = 0; + + for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++) + { + size_t dindex = follows[tindex].elems[i].index; + + /* Skip the node as pruned in future. */ + unsigned int iconstraint = follows[tindex].elems[i].constraint; + if (iconstraint == 0) + continue; + + follows[tindex].elems[nelem++] = follows[tindex].elems[i]; + + if (tindex < dindex && !(flags[dindex] & OPT_QUEUED)) + { + queue[nqueue++] = dindex; + flags[dindex] |= OPT_QUEUED; + } + + if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN)) + continue; + + for (ptrdiff_t j = i + 1; j < follows[tindex].nelem; j++) + { + size_t sindex = follows[tindex].elems[j].index; + + if (follows[tindex].elems[j].constraint != iconstraint) + continue; + + if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN)) + continue; + + if (d->tokens[sindex] != d->tokens[dindex]) + continue; + + if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT) + continue; + + if (flags[sindex] & OPT_REPEAT) + delete (sindex, &follows[sindex]); + + merge2 (&follows[dindex], &follows[sindex], merged); + + /* Mark it as pruned in future. */ + follows[tindex].elems[j].constraint = 0; + } + } + + follows[tindex].nelem = nelem; + + return nqueue; +} + +static void +dfaoptimize (struct dfa *d) +{ + char *flags; + size_t *queue; + ptrdiff_t nqueue; + position_set merged0; + position_set *merged; + ptrdiff_t extra_space = d->tindex + sizeof *queue - 1; + extra_space -= extra_space % sizeof *queue; + + queue = xmalloc (d->nleaves * sizeof *queue + extra_space); + flags = (char *) (queue + d->nleaves); + memset (flags, 0, d->tindex * sizeof *flags); + + for (size_t i = 0; i < d->tindex; i++) + { + for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++) + { + if (d->follows[i].elems[j].index == i) + flags[d->follows[i].elems[j].index] |= OPT_REPEAT; + else if (d->follows[i].elems[j].index < i) + flags[d->follows[i].elems[j].index] |= OPT_LPAREN; + else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED) + flags[d->follows[i].elems[j].index] |= OPT_RPAREN; + else + flags[d->follows[i].elems[j].index] |= OPT_WALKED; + } + } + + merged = &merged0; + alloc_position_set (merged, d->nleaves); + + nqueue = 0; + queue[nqueue++] = 0; + + for (ptrdiff_t i = 0; i < nqueue; i++) + nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged); + + free (merged->elems); + free (queue); +} /* Perform bottom-up analysis on the parse tree, computing various functions. Note that at this point, we're pretending constructs like \< are real @@ -2427,6 +2577,8 @@ dfaanalyze (struct dfa *d, bool searchflag) position_set merged; /* Result of merging sets. */ + addtok (d, CAT); + #ifdef DEBUG fprintf (stderr, "dfaanalyze:\n"); for (size_t i = 0; i < d->tindex; ++i) @@ -2569,6 +2721,8 @@ dfaanalyze (struct dfa *d, bool searchflag) #endif } + dfaoptimize (d); + #ifdef DEBUG for (size_t i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF @@ -2587,26 +2741,21 @@ dfaanalyze (struct dfa *d, bool searchflag) } #endif - /* Get the epsilon closure of the firstpos of the regexp. The result will - be the set of positions of state 0. */ - merged.nelem = 0; - for (size_t i = 0; i < stk[-1].nfirstpos; ++i) - insert (firstpos[i], &merged); /* For each follow set that is the follow set of a real position, replace it with its epsilon closure. */ - epsclosure (&merged, d); + epsclosure (&d->follows[0], d); /* Context wanted by some position. */ - int separate_contexts = state_separate_contexts (&merged); + int separate_contexts = state_separate_contexts (&d->follows[0]); /* Build the initial state. */ if (separate_contexts & CTX_NEWLINE) - state_index (d, &merged, CTX_NEWLINE); + state_index (d, &d->follows[0], CTX_NEWLINE); d->initstate_notbol = d->min_trcount - = state_index (d, &merged, separate_contexts ^ CTX_ANY); + = state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY); if (separate_contexts & CTX_LETTER) - d->min_trcount = state_index (d, &merged, CTX_LETTER); + d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER); d->min_trcount++; d->trcount = 0; @@ -3394,8 +3543,10 @@ dfa_supported (struct dfa const *d) return true; } +/* Disable use of the superset DFA if it is not likely to help + performance. */ static void -dfaoptimize (struct dfa *d) +maybe_disable_superset_dfa (struct dfa *d) { if (!d->localeinfo.using_utf8) return; @@ -3523,7 +3674,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag) if (dfa_supported (d)) { - dfaoptimize (d); + maybe_disable_superset_dfa (d); dfaanalyze (d, searchflag); } else @@ -3830,7 +3981,7 @@ dfamust (struct dfa const *d) bool need_endline = false; bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; - for (size_t ri = 0; ri < d->tindex; ++ri) + for (size_t ri = 1; ri + 1 < d->tindex; ri++) { token t = d->tokens[ri]; switch (t) diff --git a/support/intprops.h b/support/intprops.h index a4be30b8..9702aec4 100644 --- a/support/intprops.h +++ b/support/intprops.h @@ -342,8 +342,8 @@ Arguments should be free of side effects. */ #define _GL_BINARY_OP_OVERFLOW(a, b, op_result_overflow) \ op_result_overflow (a, b, \ - _GL_INT_MINIMUM ((1 ? 0 : (b)) + (a)), \ - _GL_INT_MAXIMUM ((1 ? 0 : (b)) + (a))) + _GL_INT_MINIMUM (_GL_INT_CONVERT (a, b)), \ + _GL_INT_MAXIMUM (_GL_INT_CONVERT (a, b))) /* Store the low-order bits of A + B, A - B, A * B, respectively, into *R. Return 1 if the result overflows. See above for restrictions. */ |