Sync files from GNULIB.

author: Arnold D. Robbins <arnold@skeeve.com> 2018-09-21 12:44:21 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2018-09-21 12:44:21 +0300
commit: f6a89a9b3ecd956e97b61719ea0d634886ace814 (patch)
tree: b1665dbdfe4df1e6517e08ee4da979acda903224
parent: 53175840c18d19259ec11128bdda64cad71ac8dc (diff)
download: egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.gz
egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.bz2
egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.zip
7 files changed, 239 insertions, 82 deletions
diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog
index 2990bd3f..6752ddcb 100644
--- a/missing_d/ChangeLog
+++ b/missing_d/ChangeLog
@@ -1,3 +1,8 @@
+2018-09-21         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* intprops.h, mktime.c, verify.h: Updated from GNULIB.
+	* mktime-internal.h: New file, imported from GNULIB.
+
 2018-09-07         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* intprops.h, mktime.c: Updated from GNULIB.
diff --git a/missing_d/intprops.h b/missing_d/intprops.h
index a4be30b8..9702aec4 100644
--- a/missing_d/intprops.h
+++ b/missing_d/intprops.h
@@ -342,8 +342,8 @@
    Arguments should be free of side effects.  */
 #define _GL_BINARY_OP_OVERFLOW(a, b, op_result_overflow)        \
   op_result_overflow (a, b,                                     \
-                      _GL_INT_MINIMUM ((1 ? 0 : (b)) + (a)),    \
-                      _GL_INT_MAXIMUM ((1 ? 0 : (b)) + (a)))
+                      _GL_INT_MINIMUM (_GL_INT_CONVERT (a, b)), \
+                      _GL_INT_MAXIMUM (_GL_INT_CONVERT (a, b)))
 
 /* Store the low-order bits of A + B, A - B, A * B, respectively, into *R.
    Return 1 if the result overflows.  See above for restrictions.  */
diff --git a/missing_d/mktime.c b/missing_d/mktime.c
index 1a332e14..3ecab0d1 100644
--- a/missing_d/mktime.c
+++ b/missing_d/mktime.c
@@ -76,11 +76,7 @@
 # define NEED_MKTIME_WORKING DEBUG_MKTIME
 #endif
 
-#ifdef _LIBC
-typedef long int mktime_offset_t;
-#else
 # include "mktime-internal.h"
-#endif
 
 #ifndef _LIBC
 static void
@@ -531,7 +527,7 @@ mktime (struct tm *tp)
      be set as if the tzset() function had been called.  */
   __tzset ();
 
-# if defined __LIBC || NEED_MKTIME_WORKING
+# if defined _LIBC || NEED_MKTIME_WORKING
   static mktime_offset_t localtime_offset;
   return __mktime_internal (tp, __localtime_r, &localtime_offset);
 # else
diff --git a/missing_d/verify.h b/missing_d/verify.h
index bc7f99db..3b57ddee 100644
--- a/missing_d/verify.h
+++ b/missing_d/verify.h
@@ -276,7 +276,8 @@ template <int w>
      when 'assume' silences warnings even with older GCCs.  */
 # define assume(R) ((R) ? (void) 0 : __builtin_trap ())
 #else
-# define assume(R) ((void) (0 && (R)))
+  /* Some tools grok NOTREACHED, e.g., Oracle Studio 12.6.  */
+# define assume(R) ((R) ? (void) 0 : /*NOTREACHED*/ (void) 0)
 #endif
 
 /* @assert.h omit end@  */
diff --git a/support/ChangeLog b/support/ChangeLog
index 86e24b36..d426f53e 100644
--- a/support/ChangeLog
+++ b/support/ChangeLog
@@ -1,3 +1,7 @@
+2018-09-21         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* dfa.c, intprops.h: Sync from GNULIB.
+
 2018-09-16         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* Makefile.in: Regenerated, using Automake 1.16.1.
diff --git a/support/dfa.c b/support/dfa.c
index 41df67bf..86eb677e 100644
--- a/support/dfa.c
+++ b/support/dfa.c
@@ -257,43 +257,19 @@ enum
                                    end of input; any value of END or less in
                                    the parse tree is such a symbol.  Accepting
                                    states of the DFA are those that would have
-                                   a transition on END.  */
+                                   a transition on END.  This is -1, not some
+                                   more-negative value, to tweak the speed of
+                                   comparisons to END.  */
 
   /* Ordinary character values are terminal symbols that match themselves.  */
+  /* CSET must come last in the following list of special tokens.  Otherwise,
+     the list order matters only for performance.  Related special tokens
+     should have nearby values so that code like (t == ANYCHAR || t == MBCSET
+     || CSET <= t) can be done with a single machine-level comparison.  */
 
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>
-                                   or by any other construct that
-                                   is not completely handled.  If the scanner
-                                   detects a transition on backref, it returns
-                                   a kind of "semi-success" indicating that
-                                   the match will have to be verified with
-                                   a backtracking matcher.  */
-
-  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   line.  */
-
-  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
-                                   the empty string at the end of a line.  */
-
-  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   word.  */
-
-  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
-                                   the empty string at the end of a word.  */
-
-  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
-                                   the empty string at the beginning or the
-                                   end of a word.  */
-
-  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
-                                   matches the empty string not at
-                                   the beginning or end of a word.  */
-
   QMARK,                        /* QMARK is an operator of one argument that
                                    matches zero or one occurrences of its
                                    argument.  */
@@ -323,15 +299,47 @@ enum
 
   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 
+  WCHAR,                        /* Only returned by lex.  wctok contains
+                                   the wide character representation.  */
   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
                                    a valid multibyte (or single byte) character.
                                    It is used only if MB_CUR_MAX > 1.  */
+  BEG,                          /* BEG is an initial symbol that matches the
+                                   beginning of input.  */
+
+  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   line.  */
+
+  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
+                                   the empty string at the end of a line.  */
+
+  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   word.  */
+
+  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
+                                   the empty string at the end of a word.  */
+
+  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
+                                   the empty string at the beginning or the
+                                   end of a word.  */
+
+  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
+                                   matches the empty string not at
+                                   the beginning or end of a word.  */
+
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
+                                   is not completely handled.  If the scanner
+                                   detects a transition on backref, it returns
+                                   a kind of "semi-success" indicating that
+                                   the match will have to be verified with
+                                   a backtracking matcher.  */
 
   MBCSET,                       /* MBCSET is similar to CSET, but for
                                    multibyte characters.  */
 
-  WCHAR,                        /* Only returned by lex.  wctok contains
-                                   the wide character representation.  */
 
   CSET                          /* CSET and (and any value greater) is a
                                    terminal symbol that matches any of a
@@ -660,9 +668,9 @@ mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 static void
 prtok (token t)
 {
-  if (t < 0)
+  if (t <= END)
     fprintf (stderr, "END");
-  else if (t < NOTCHAR)
+  else if (0 <= t && t < NOTCHAR)
     {
       unsigned int ch = t;
       fprintf (stderr, "0x%02x", ch);
@@ -672,6 +680,9 @@ prtok (token t)
       char const *s;
       switch (t)
         {
+        case BEG:
+          s = "BEG";
+          break;
         case EMPTY:
           s = "EMPTY";
           break;
@@ -1844,7 +1855,30 @@ add_utf8_anychar (struct dfa *dfa)
 static void
 atom (struct dfa *dfa)
 {
-  if (dfa->parse.tok == WCHAR)
+  if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+      || dfa->parse.tok >= CSET
+      || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
+      || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+      || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
+      || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
+      || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
+    {
+      if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
+        {
+          /* For UTF-8 expand the period to a series of CSETs that define a
+             valid UTF-8 character.  This avoids using the slow multibyte
+             path.  I'm pretty sure it would be both profitable and correct to
+             do it for any encoding; however, the optimization must be done
+             manually as it is done above in add_utf8_anychar.  So, let's
+             start with UTF-8: it is the most used, and the structure of the
+             encoding makes the correctness more obvious.  */
+          add_utf8_anychar (dfa);
+        }
+      else
+        addtok (dfa, dfa->parse.tok);
+      dfa->parse.tok = lex (dfa);
+    }
+  else if (dfa->parse.tok == WCHAR)
     {
       if (dfa->lex.wctok == WEOF)
         addtok (dfa, BACKREF);
@@ -1867,28 +1901,6 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
-    {
-      /* For UTF-8 expand the period to a series of CSETs that define a valid
-         UTF-8 character.  This avoids using the slow multibyte path.  I'm
-         pretty sure it would be both profitable and correct to do it for
-         any encoding; however, the optimization must be done manually as
-         it is done above in add_utf8_anychar.  So, let's start with
-         UTF-8: it is the most used, and the structure of the encoding
-         makes the correctness more obvious.  */
-      add_utf8_anychar (dfa);
-      dfa->parse.tok = lex (dfa);
-    }
-  else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
-           || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
-           || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
-           || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
-           || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
-           || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
-    {
-      addtok (dfa, dfa->parse.tok);
-      dfa->parse.tok = lex (dfa);
-    }
   else if (dfa->parse.tok == LPAREN)
     {
       dfa->parse.tok = lex (dfa);
@@ -2014,6 +2026,9 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   if (!d->syntax.syntax_bits_set)
     dfaerror (_("no syntax specified"));
 
+  if (!d->nregexps)
+    addtok (d, BEG);
+
   d->parse.tok = lex (d);
   d->parse.depth = d->depth;
 
@@ -2132,6 +2147,21 @@ merge (position_set const *s1, position_set const *s2, position_set *m)
   merge_constrained (s1, s2, -1, m);
 }
 
+static void
+merge2 (position_set *dst, position_set const *src, position_set *m)
+{
+  if (src->nelem < 4)
+    {
+      for (ptrdiff_t i = 0; i < src->nelem; ++i)
+        insert (src->elems[i], dst);
+    }
+   else
+    {
+      merge (src, dst, m);
+      copy (m, dst);
+    }
+}
+
 /* Delete a position from a set.  Return the nonzero constraint of the
    deleted position, or zero if there was no such position.  */
 static unsigned int
@@ -2227,7 +2257,7 @@ state_index (struct dfa *d, position_set const *s, int context)
   for (state_num j = 0; j < s->nelem; j++)
     {
       int c = s->elems[j].constraint;
-      if (d->tokens[s->elems[j].index] < 0)
+      if (d->tokens[s->elems[j].index] <= END)
         {
           if (succeeds_in_context (c, context, CTX_ANY))
             constraint |= c;
@@ -2352,6 +2382,126 @@ state_separate_contexts (position_set const *s)
   return separate_contexts;
 }
 
+enum
+{
+  /* Single token is repeated.  It is distinguished from non-repeated.  */
+  OPT_REPEAT = (1 << 0),
+
+  /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
+     node is not merged.  */
+  OPT_LPAREN = (1 << 1),
+
+  /* Multiple branches are joined.  The node is not merged.  */
+  OPT_RPAREN = (1 << 2),
+
+  /* The node is walked.  If the node is found in walking again, OPT_RPAREN
+     flag is turned on. */
+  OPT_WALKED = (1 << 3),
+
+  /* The node is queued.  The node is not queued again.  */
+  OPT_QUEUED = (1 << 4)
+};
+
+static ptrdiff_t
+merge_nfa_state (struct dfa *d, size_t *queue, ptrdiff_t nqueue, size_t tindex,
+                 char *flags, position_set *merged)
+{
+  position_set *follows = d->follows;
+  ptrdiff_t nelem = 0;
+
+  for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)
+    {
+      size_t dindex = follows[tindex].elems[i].index;
+
+      /* Skip the node as pruned in future.  */
+      unsigned int iconstraint = follows[tindex].elems[i].constraint;
+      if (iconstraint == 0)
+        continue;
+
+      follows[tindex].elems[nelem++] = follows[tindex].elems[i];
+
+      if (tindex < dindex && !(flags[dindex] & OPT_QUEUED))
+        {
+          queue[nqueue++] = dindex;
+          flags[dindex] |= OPT_QUEUED;
+        }
+
+      if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
+        continue;
+
+      for (ptrdiff_t j = i + 1; j < follows[tindex].nelem; j++)
+        {
+          size_t sindex = follows[tindex].elems[j].index;
+
+          if (follows[tindex].elems[j].constraint != iconstraint)
+            continue;
+
+          if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN))
+            continue;
+
+          if (d->tokens[sindex] != d->tokens[dindex])
+            continue;
+
+          if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
+            continue;
+
+          if (flags[sindex] & OPT_REPEAT)
+            delete (sindex, &follows[sindex]);
+
+          merge2 (&follows[dindex], &follows[sindex], merged);
+
+          /* Mark it as pruned in future.  */
+          follows[tindex].elems[j].constraint = 0;
+        }
+    }
+
+  follows[tindex].nelem = nelem;
+
+  return nqueue;
+}
+
+static void
+dfaoptimize (struct dfa *d)
+{
+  char *flags;
+  size_t *queue;
+  ptrdiff_t nqueue;
+  position_set merged0;
+  position_set *merged;
+  ptrdiff_t extra_space = d->tindex + sizeof *queue - 1;
+  extra_space -= extra_space % sizeof *queue;
+
+  queue = xmalloc (d->nleaves * sizeof *queue + extra_space);
+  flags = (char *) (queue + d->nleaves);
+  memset (flags, 0, d->tindex * sizeof *flags);
+
+  for (size_t i = 0; i < d->tindex; i++)
+    {
+      for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
+        {
+          if (d->follows[i].elems[j].index == i)
+            flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
+          else if (d->follows[i].elems[j].index < i)
+            flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
+          else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
+            flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
+          else
+            flags[d->follows[i].elems[j].index] |= OPT_WALKED;
+        }
+    }
+
+  merged = &merged0;
+  alloc_position_set (merged, d->nleaves);
+
+  nqueue = 0;
+  queue[nqueue++] = 0;
+
+  for (ptrdiff_t i = 0; i < nqueue; i++)
+    nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged);
+
+  free (merged->elems);
+  free (queue);
+}
 
 /* Perform bottom-up analysis on the parse tree, computing various functions.
    Note that at this point, we're pretending constructs like \< are real
@@ -2427,6 +2577,8 @@ dfaanalyze (struct dfa *d, bool searchflag)
 
   position_set merged;          /* Result of merging sets.  */
 
+  addtok (d, CAT);
+
 #ifdef DEBUG
   fprintf (stderr, "dfaanalyze:\n");
   for (size_t i = 0; i < d->tindex; ++i)
@@ -2569,6 +2721,8 @@ dfaanalyze (struct dfa *d, bool searchflag)
 #endif
     }
 
+  dfaoptimize (d);
+
 #ifdef DEBUG
   for (size_t i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
@@ -2587,26 +2741,21 @@ dfaanalyze (struct dfa *d, bool searchflag)
       }
 #endif
 
-  /* Get the epsilon closure of the firstpos of the regexp.  The result will
-     be the set of positions of state 0.  */
-  merged.nelem = 0;
-  for (size_t i = 0; i < stk[-1].nfirstpos; ++i)
-    insert (firstpos[i], &merged);
 
   /* For each follow set that is the follow set of a real position, replace
      it with its epsilon closure.  */
-  epsclosure (&merged, d);
+  epsclosure (&d->follows[0], d);
 
   /* Context wanted by some position.  */
-  int separate_contexts = state_separate_contexts (&merged);
+  int separate_contexts = state_separate_contexts (&d->follows[0]);
 
   /* Build the initial state.  */
   if (separate_contexts & CTX_NEWLINE)
-    state_index (d, &merged, CTX_NEWLINE);
+    state_index (d, &d->follows[0], CTX_NEWLINE);
   d->initstate_notbol = d->min_trcount
-    = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+    = state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY);
   if (separate_contexts & CTX_LETTER)
-    d->min_trcount = state_index (d, &merged, CTX_LETTER);
+    d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER);
   d->min_trcount++;
   d->trcount = 0;
 
@@ -3394,8 +3543,10 @@ dfa_supported (struct dfa const *d)
   return true;
 }
 
+/* Disable use of the superset DFA if it is not likely to help
+   performance.  */
 static void
-dfaoptimize (struct dfa *d)
+maybe_disable_superset_dfa (struct dfa *d)
 {
   if (!d->localeinfo.using_utf8)
     return;
@@ -3523,7 +3674,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
 
   if (dfa_supported (d))
     {
-      dfaoptimize (d);
+      maybe_disable_superset_dfa (d);
       dfaanalyze (d, searchflag);
     }
   else
@@ -3830,7 +3981,7 @@ dfamust (struct dfa const *d)
   bool need_endline = false;
   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
 
-  for (size_t ri = 0; ri < d->tindex; ++ri)
+  for (size_t ri = 1; ri + 1 < d->tindex; ri++)
     {
       token t = d->tokens[ri];
       switch (t)
diff --git a/support/intprops.h b/support/intprops.h
index a4be30b8..9702aec4 100644
--- a/support/intprops.h
+++ b/support/intprops.h
@@ -342,8 +342,8 @@
    Arguments should be free of side effects.  */
 #define _GL_BINARY_OP_OVERFLOW(a, b, op_result_overflow)        \
   op_result_overflow (a, b,                                     \
-                      _GL_INT_MINIMUM ((1 ? 0 : (b)) + (a)),    \
-                      _GL_INT_MAXIMUM ((1 ? 0 : (b)) + (a)))
+                      _GL_INT_MINIMUM (_GL_INT_CONVERT (a, b)), \
+                      _GL_INT_MAXIMUM (_GL_INT_CONVERT (a, b)))
 
 /* Store the low-order bits of A + B, A - B, A * B, respectively, into *R.
    Return 1 if the result overflows.  See above for restrictions.  */
author	Arnold D. Robbins <arnold@skeeve.com>	2018-09-21 12:44:21 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2018-09-21 12:44:21 +0300
commit	f6a89a9b3ecd956e97b61719ea0d634886ace814 (patch)
tree	b1665dbdfe4df1e6517e08ee4da979acda903224
parent	53175840c18d19259ec11128bdda64cad71ac8dc (diff)
download	egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.gz egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.tar.bz2 egawk-f6a89a9b3ecd956e97b61719ea0d634886ace814.zip