Merge branch 'gawk-4.0-stable'

author: Arnold D. Robbins <arnold@skeeve.com> 2012-03-22 22:10:13 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2012-03-22 22:10:13 +0200
commit: c7f51459f7da3e0c693f8a950726869d99e5bcf2 (patch)
tree: 2e4b54cc695adc5b4fdaec56e237250c030a5419 /dfa.c
parent: 1e495ca269aa2653bf804088ebe532b67110a3ef (diff)
parent: b467a6d3d604723e0c152dceb09e998c059bfa40 (diff)
download: egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.tar.gz
egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.tar.bz2
egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.zip
1 files changed, 243 insertions, 205 deletions
diff --git a/dfa.c b/dfa.c
index acd1a947..64ce8f7c 100644
--- a/dfa.c
+++ b/dfa.c
@@ -1,5 +1,5 @@
 /* dfa.c - deterministic extended regexp routines for GNU
-   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2011 Free Software
+   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
    Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
@@ -114,6 +114,19 @@ typedef int charclass[CHARCLASS_INTS];
    errors that the cast doesn't.  */
 static inline unsigned char to_uchar (char ch) { return ch; }
 
+/* Contexts tell us whether a character is a newline or a word constituent.
+   Word-constituent characters are those that satisfy iswalnum(), plus '_'.
+
+   A state also stores a context value, which is nonzero if its
+   predecessors always matches a newline or a word constituent.
+   The definition of a state's context is a bit unclear, but will be
+   modified soon anyway.  */
+
+#define CTX_NONE	1
+#define CTX_LETTER	2
+#define CTX_NEWLINE	4
+#define CTX_ANY		7
+
 /* Sometimes characters can only be matched depending on the surrounding
    context.  Such context decisions depend on what the previous character
    was, and the value of the current (lookahead) character.  Context
@@ -130,20 +143,19 @@ static inline unsigned char to_uchar (char ch) { return ch; }
    bit 1 - previous wasn't word-constituent, current is
    bit 0 - neither previous nor current is word-constituent
 
-   Word-constituent characters are those that satisfy isalnum().
-
    The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
-   succeeds in a particular context.  Prevn is true if the previous character
-   was a newline, currn is true if the lookahead character is a newline.
-   Prevl and currl similarly depend upon whether the previous and current
-   characters are word-constituent letters. */
-#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
-  ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
-#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
-  ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
-#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
-  (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn)		     \
-   && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
+   succeeds in a particular context.  Prev is the context value for
+   the previous character, curr is the context value for the lookahead
+   character. */
+#define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
+  ((constraint) & \
+   1 << (((prev & CTX_NEWLINE) ? 2 : 0) + ((curr & CTX_NEWLINE) ? 1 : 0) + 4))
+#define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
+  ((constraint) & \
+   1 << (((prev & CTX_LETTER) ? 2 : 0) + ((curr & CTX_LETTER) ? 1 : 0)))
+#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
+  (MATCHES_NEWLINE_CONTEXT(constraint, prev, curr)		     \
+   && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
 
 /* The following macros give information about what a constraint depends on. */
 #define PREV_NEWLINE_DEPENDENT(constraint) \
@@ -269,9 +281,17 @@ typedef struct
 typedef struct
 {
   position *elems;		/* Elements of this position set. */
-  int nelem;			/* Number of elements in this set. */
+  size_t nelem;			/* Number of elements in this set. */
+  size_t alloc;			/* Number of elements allocated in ELEMS.  */
 } position_set;
 
+/* Sets of leaves are also stored as arrays. */
+typedef struct
+{
+  unsigned int *elems;		/* Elements of this position set. */
+  size_t nelem;			/* Number of elements in this set. */
+} leaf_set;
+
 /* A state of the dfa consists of a set of positions, some flags,
    and the token value of the lowest-numbered position of the state that
    contains an END token. */
@@ -279,9 +299,8 @@ typedef struct
 {
   int hash;			/* Hash of the positions of this state. */
   position_set elems;		/* Positions this state could match. */
-  char newline;			/* True if previous state matched newline. */
-  char letter;			/* True if previous state matched a letter. */
-  char backref;			/* True if this state matches a \<digit>. */
+  unsigned char context;	/* Context from previous state. */
+  char backref;			/* True if this state matches a \<digit>.  */
   unsigned char constraint;	/* Constraint for this state to accept. */
   int first_end;		/* Token value of the first END in elems. */
   position_set mbps;           /* Positions which can match multibyte
@@ -413,9 +432,8 @@ struct dfa
 
 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
    specified context. */
-#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
-  SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint,		   \
-                       prevn, currn, prevl, currl)
+#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
+  SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, prev, curr)
 
 static void dfamust (struct dfa *dfa);
 static void regexp (void);
@@ -445,7 +463,6 @@ static void regexp (void);
 #define REALLOC_IF_NECESSARY(p, n_alloc, n_required)		\
   do								\
     {								\
-      assert (0 <= (n_required));				\
       if ((n_alloc) <= (n_required))				\
         {							\
           size_t new_n_alloc = (n_required) + !(p);		\
@@ -569,14 +586,72 @@ static int case_fold;
 /* End-of-line byte in data.  */
 static unsigned char eolbyte;
 
+/* Cache of char-context values.  */
+static int sbit[NOTCHAR];
+
+/* Set of characters considered letters. */
+static charclass letters;
+
+/* Set of characters that are newline. */
+static charclass newline;
+
+/* Add this to the test for whether a byte is word-constituent, since on
+   BSD-based systems, many values in the 128..255 range are classified as
+   alphabetic, while on glibc-based systems, they are not.  */
+#ifdef __GLIBC__
+# define is_valid_unibyte_character(c) 1
+#else
+# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
+#endif
+
+/* Return non-zero if C is a 'word-constituent' byte; zero otherwise.  */
+#define IS_WORD_CONSTITUENT(C) \
+  (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
+
+static int
+char_context (unsigned char c)
+{
+  if (c == eolbyte || c == 0)
+    return CTX_NEWLINE;
+  if (IS_WORD_CONSTITUENT (c))
+    return CTX_LETTER;
+  return CTX_NONE;
+}
+
+static int
+wchar_context(wint_t wc)
+{
+  if (wc == (wchar_t)eolbyte || wc == 0)
+    return CTX_NEWLINE;
+  if (wc == L'_' || iswalnum (wc))
+    return CTX_LETTER;
+  return CTX_NONE;
+}
+
 /* Entry point to set syntax options. */
 void
 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 {
+  unsigned int i;
+
   syntax_bits_set = 1;
   syntax_bits = bits;
   case_fold = fold;
   eolbyte = eol;
+
+  for (i = 0; i < NOTCHAR; ++i)
+    {
+      sbit[i] = char_context (i);
+      switch (sbit[i])
+        {
+        case CTX_LETTER:
+          setbit (i, letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (i, newline);
+          break;
+        }
+    }
 }
 
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
@@ -820,7 +895,7 @@ parse_bracket_exp (void)
   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
     equivs_al, coll_elems_al;
 
-  chars_al = 1;
+  chars_al = 0;
   range_sts_al = range_ends_al = 0;
   ch_classes_al = equivs_al = coll_elems_al = 0;
   if (MB_CUR_MAX > 1)
@@ -903,8 +978,6 @@ parse_bracket_exp (void)
                       /* Store the character class as wctype_t.  */
                       wctype_t wt = wctype (class);
 
-                      if (ch_classes_al == 0)
-                        MALLOC(work_mbc->ch_classes, ++ch_classes_al);
                       REALLOC_IF_NECESSARY(work_mbc->ch_classes,
                                            ch_classes_al,
                                            work_mbc->nch_classes + 1);
@@ -925,8 +998,6 @@ parse_bracket_exp (void)
                   if (c1 == '=')
                     /* build equivalent class.  */
                     {
-                      if (equivs_al == 0)
-                        MALLOC(work_mbc->equivs, ++equivs_al);
                       REALLOC_IF_NECESSARY(work_mbc->equivs,
                                            equivs_al,
                                            work_mbc->nequivs + 1);
@@ -936,8 +1007,6 @@ parse_bracket_exp (void)
                   if (c1 == '.')
                     /* build collating element.  */
                     {
-                      if (coll_elems_al == 0)
-                        MALLOC(work_mbc->coll_elems, ++coll_elems_al);
                       REALLOC_IF_NECESSARY(work_mbc->coll_elems,
                                            coll_elems_al,
                                            work_mbc->ncoll_elems + 1);
@@ -984,11 +1053,6 @@ parse_bracket_exp (void)
             {
               /* When case folding map a range, say [m-z] (or even [M-z])
                  to the pair of ranges, [m-z] [M-Z].  */
-              if (range_sts_al == 0)
-                {
-                  MALLOC(work_mbc->range_sts, ++range_sts_al);
-                  MALLOC(work_mbc->range_ends, ++range_ends_al);
-                }
               REALLOC_IF_NECESSARY(work_mbc->range_sts,
                                    range_sts_al, work_mbc->nranges + 1);
               REALLOC_IF_NECESSARY(work_mbc->range_ends,
@@ -1081,19 +1145,6 @@ parse_bracket_exp (void)
   return CSET + charclass_index(ccl);
 }
 
-/* Add this to the test for whether a byte is word-constituent, since on
-   BSD-based systems, many values in the 128..255 range are classified as
-   alphabetic, while on glibc-based systems, they are not.  */
-#ifdef __GLIBC__
-# define is_valid_unibyte_character(c) 1
-#else
-# define is_valid_unibyte_character(c) (MBS_SUPPORT && btowc (c) != WEOF)
-#endif
-
-/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
-#define IS_WORD_CONSTITUENT(C) \
-  (is_valid_unibyte_character(C) && (isalnum(C) || (C) == '_'))
-
 static token
 lex (void)
 {
@@ -1832,10 +1883,19 @@ dfaparse (char const *s, size_t len, struct dfa *d)
 static void
 copy (position_set const *src, position_set *dst)
 {
+  REALLOC_IF_NECESSARY(dst->elems, dst->alloc, src->nelem);
   memcpy(dst->elems, src->elems, sizeof(dst->elems[0]) * src->nelem);
   dst->nelem = src->nelem;
 }
 
+static void
+alloc_position_set (position_set *s, size_t size)
+{
+  MALLOC(s->elems, size);
+  s->alloc = size;
+  s->nelem = 0;
+}
+
 /* Insert position P in set S.  S is maintained in sorted order on
    decreasing index.  If there is already an entry in S with P.index
    then merge (logically-OR) P's constraints into the one in S.
@@ -1845,6 +1905,7 @@ insert (position p, position_set *s)
 {
   int count = s->nelem;
   int lo = 0, hi = count;
+  int i;
   while (lo < hi)
     {
       int mid = ((unsigned) lo + (unsigned) hi) >> 1;
@@ -1855,15 +1916,16 @@ insert (position p, position_set *s)
     }
 
   if (lo < count && p.index == s->elems[lo].index)
-    s->elems[lo].constraint |= p.constraint;
-  else
     {
-      int i;
-      for (i = count; i > lo; i--)
-        s->elems[i] = s->elems[i - 1];
-      s->elems[lo] = p;
-      ++s->nelem;
+      s->elems[lo].constraint |= p.constraint;
+      return;
     }
+
+  REALLOC_IF_NECESSARY(s->elems, s->alloc, count + 1);
+  for (i = count; i > lo; i--)
+    s->elems[i] = s->elems[i - 1];
+  s->elems[lo] = p;
+  ++s->nelem;
 }
 
 /* Merge two sets of positions into a third.  The result is exactly as if
@@ -1873,6 +1935,7 @@ merge (position_set const *s1, position_set const *s2, position_set *m)
 {
   int i = 0, j = 0;
 
+  REALLOC_IF_NECESSARY(m->elems, m->alloc, s1->nelem + s2->nelem);
   m->nelem = 0;
   while (i < s1->nelem && j < s2->nelem)
     if (s1->elems[i].index > s2->elems[j].index)
@@ -1906,18 +1969,15 @@ delete (position p, position_set *s)
 
 /* Find the index of the state corresponding to the given position set with
    the given preceding context, or create a new state if there is no such
-   state.  Newline and letter tell whether we got here on a newline or
-   letter, respectively. */
+   state.  Context tells whether we got here on a newline or letter. */
 static int
-state_index (struct dfa *d, position_set const *s, int newline, int letter)
+state_index (struct dfa *d, position_set const *s, int context)
 {
   int hash = 0;
   int constraint;
   int i, j;
 
-  newline = newline ? 1 : 0;
-  letter = letter ? 1 : 0;
-
+  context &= ~CTX_NONE;
   for (i = 0; i < s->nelem; ++i)
     hash ^= s->elems[i].index + s->elems[i].constraint;
 
@@ -1925,7 +1985,7 @@ state_index (struct dfa *d, position_set const *s, int newline, int letter)
   for (i = 0; i < d->sindex; ++i)
     {
       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
-          || newline != d->states[i].newline || letter != d->states[i].letter)
+          || context != d->states[i].context)
         continue;
       for (j = 0; j < s->nelem; ++j)
         if (s->elems[j].constraint
@@ -1939,10 +1999,9 @@ state_index (struct dfa *d, position_set const *s, int newline, int letter)
   /* We'll have to create a new state. */
   REALLOC_IF_NECESSARY(d->states, d->salloc, d->sindex + 1);
   d->states[i].hash = hash;
-  MALLOC(d->states[i].elems.elems, s->nelem);
+  alloc_position_set(&d->states[i].elems, s->nelem);
   copy(s, &d->states[i].elems);
-  d->states[i].newline = newline;
-  d->states[i].letter = letter;
+  d->states[i].context = context;
   d->states[i].backref = 0;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
@@ -1955,10 +2014,9 @@ state_index (struct dfa *d, position_set const *s, int newline, int letter)
     if (d->tokens[s->elems[j].index] < 0)
       {
         constraint = s->elems[j].constraint;
-        if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
-            || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
-            || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
-            || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
+        if (SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NONE)
+            || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NEWLINE)
+            || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_LETTER))
           d->states[i].constraint |= constraint;
         if (! d->states[i].first_end)
           d->states[i].first_end = d->tokens[s->elems[j].index];
@@ -2041,6 +2099,55 @@ epsclosure (position_set *s, struct dfa const *d)
   free(visited);
 }
 
+/* Returns the set of contexts for which there is at least one
+   character included in C.  */
+
+static int
+charclass_context(charclass c)
+{
+  int context = 0;
+  unsigned int j;
+
+  if (tstbit(eolbyte, c))
+    context |= CTX_NEWLINE;
+
+  for (j = 0; j < CHARCLASS_INTS; ++j)
+    {
+      if (c[j] & letters[j])
+        context |= CTX_LETTER;
+      if (c[j] & ~(letters[j] | newline[j]))
+        context |= CTX_NONE;
+    }
+
+  return context;
+}
+
+/* Returns the subset of POSSIBLE_CONTEXTS on which the position set S
+   depends.  Each context in the set of returned contexts (let's call it
+   SC) may have a different follow set than other contexts in SC, and
+   also different from the follow set of the complement set.  However,
+   all contexts in the complement set will have the same follow set.  */
+
+static int _GL_ATTRIBUTE_PURE
+state_separate_contexts (position_set *s, int possible_contexts)
+{
+  int separate_context = 0;
+  unsigned int j;
+
+  for (j = 0; j < s->nelem; ++j)
+    {
+      if ((possible_contexts & CTX_NEWLINE)
+          && PREV_NEWLINE_DEPENDENT(s->elems[j].constraint))
+        separate_context |= CTX_NEWLINE;
+      if ((possible_contexts & CTX_LETTER)
+          && PREV_LETTER_DEPENDENT(s->elems[j].constraint))
+        separate_context |= CTX_LETTER;
+    }
+
+  return separate_context;
+}
+
+
 /* Perform bottom-up analysis on the parse tree, computing various functions.
    Note that at this point, we're pretending constructs like \< are real
    characters rather than constraints on what can follow them.
@@ -2101,10 +2208,9 @@ dfaanalyze (struct dfa *d, int searchflag)
   position *firstpos;		/* Array where firstpos elements are stored. */
   int *nlastpos;		/* Element count stack for lastpos sets. */
   position *lastpos;		/* Array where lastpos elements are stored. */
-  int *nalloc;			/* Sizes of arrays allocated to follow sets. */
   position_set tmp;		/* Temporary set for merging sets. */
   position_set merged;		/* Result of merging sets. */
-  int wants_newline;		/* True if some position wants newline info. */
+  int separate_contexts;	/* Context wanted by some position. */
   int *o_nullable;
   int *o_nfirst, *o_nlast;
   position *o_firstpos, *o_lastpos;
@@ -2133,8 +2239,7 @@ dfaanalyze (struct dfa *d, int searchflag)
   o_nlast = nlastpos;
   MALLOC(lastpos, d->nleaves);
   o_lastpos = lastpos, lastpos += d->nleaves;
-  CALLOC(nalloc, d->tindex);
-  MALLOC(merged.elems, d->nleaves);
+  alloc_position_set(&merged, d->nleaves);
 
   CALLOC(d->follows, d->tindex);
 
@@ -2160,8 +2265,6 @@ dfaanalyze (struct dfa *d, int searchflag)
         for (j = 0; j < nlastpos[-1]; ++j)
           {
             merge(&tmp, &d->follows[pos[j].index], &merged);
-            REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems,
-                                 nalloc[pos[j].index], merged.nelem);
             copy(&merged, &d->follows[pos[j].index]);
           }
 
@@ -2180,8 +2283,6 @@ dfaanalyze (struct dfa *d, int searchflag)
         for (j = 0; j < nlastpos[-2]; ++j)
           {
             merge(&tmp, &d->follows[pos[j].index], &merged);
-            REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems,
-                                 nalloc[pos[j].index], merged.nelem);
             copy(&merged, &d->follows[pos[j].index]);
           }
 
@@ -2241,8 +2342,7 @@ dfaanalyze (struct dfa *d, int searchflag)
         firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
 
         /* Allocate the follow set for this position. */
-        nalloc[i] = 1;
-        MALLOC(d->follows[i].elems, nalloc[i]);
+        alloc_position_set(&d->follows[i], 1);
         break;
       }
 #ifdef DEBUG
@@ -2290,8 +2390,6 @@ dfaanalyze (struct dfa *d, int searchflag)
 #endif
         copy(&d->follows[i], &merged);
         epsclosure(&merged, d);
-        if (d->follows[i].nelem < merged.nelem)
-          REALLOC(d->follows[i].elems, merged.nelem);
         copy(&merged, &d->follows[i]);
       }
 
@@ -2302,27 +2400,23 @@ dfaanalyze (struct dfa *d, int searchflag)
     insert(firstpos[i], &merged);
   epsclosure(&merged, d);
 
-  /* Check if any of the positions of state 0 will want newline context. */
-  wants_newline = 0;
-  for (i = 0; i < merged.nelem; ++i)
-    if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
-      wants_newline = 1;
-
   /* Build the initial state. */
   d->salloc = 1;
   d->sindex = 0;
   MALLOC(d->states, d->salloc);
-  state_index(d, &merged, wants_newline, 0);
+
+  separate_contexts = state_separate_contexts(&merged, CTX_NEWLINE);
+  state_index(d, &merged, separate_contexts);
 
   free(o_nullable);
   free(o_nfirst);
   free(o_firstpos);
   free(o_nlast);
   free(o_lastpos);
-  free(nalloc);
   free(merged.elems);
 }
 
+
 /* Find, for each character, the transition out of state s of d, and store
    it in the appropriate slot of trans.
 
@@ -2356,7 +2450,7 @@ dfaanalyze (struct dfa *d, int searchflag)
 void
 dfastate (int s, struct dfa *d, int trans[])
 {
-  position_set *grps;		/* As many as will ever be needed. */
+  leaf_set *grps;		/* As many as will ever be needed. */
   charclass *labels;		/* Labels corresponding to the groups. */
   int ngrps = 0;		/* Number of groups actually used. */
   position pos;			/* Current position being considered. */
@@ -2366,31 +2460,18 @@ dfastate (int s, struct dfa *d, int trans[])
   int intersectf;		/* True if intersect is nonempty. */
   charclass leftovers;		/* Stuff in the label that didn't match. */
   int leftoversf;		/* True if leftovers is nonempty. */
-  static charclass letters;	/* Set of characters considered letters. */
-  static charclass newline;	/* Set of characters that aren't newline. */
   position_set follows;		/* Union of the follows of some group. */
   position_set tmp;		/* Temporary space for merging sets. */
+  int possible_contexts;	/* Contexts that this group can match. */
+  int separate_contexts;	/* Context that new state wants to know. */
   int state;			/* New state. */
-  int wants_newline;		/* New state wants to know newline context. */
   int state_newline;		/* New state on a newline transition. */
-  int wants_letter;		/* New state wants to know letter context. */
   int state_letter;		/* New state on a letter transition. */
-  static int initialized;	/* Flag for static initialization. */
   int next_isnt_1st_byte = 0;	/* Flag if we can't add state0.  */
   int i, j, k;
 
-  grps = xnmalloc (NOTCHAR, sizeof *grps);
-  labels = xnmalloc (NOTCHAR, sizeof *labels);
-
-  /* Initialize the set of letters, if necessary. */
-  if (! initialized)
-    {
-      initialized = 1;
-      for (i = 0; i < NOTCHAR; ++i)
-        if (IS_WORD_CONSTITUENT(i))
-          setbit(i, letters);
-      setbit(eolbyte, newline);
-    }
+  MALLOC (grps, NOTCHAR);
+  MALLOC (labels, NOTCHAR);
 
   zeroset(matches);
 
@@ -2410,9 +2491,7 @@ dfastate (int s, struct dfa *d, int trans[])
              must put it to d->states[s].mbps, which contains the positions
              which can match with a single character not a byte.  */
           if (d->states[s].mbps.nelem == 0)
-            {
-              MALLOC(d->states[s].mbps.elems, d->states[s].elems.nelem);
-            }
+            alloc_position_set(&d->states[s].mbps, 1);
           insert(pos, &(d->states[s].mbps));
           continue;
         }
@@ -2424,18 +2503,20 @@ dfastate (int s, struct dfa *d, int trans[])
       if (pos.constraint != 0xFF)
         {
           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
-                                         d->states[s].newline, 1))
+                                        d->states[s].context & CTX_NEWLINE,
+                                        CTX_NEWLINE))
             clrbit(eolbyte, matches);
           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
-                                         d->states[s].newline, 0))
+                                        d->states[s].context & CTX_NEWLINE, 0))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= newline[j];
           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
-                                        d->states[s].letter, 1))
+                                       d->states[s].context & CTX_LETTER,
+                                       CTX_LETTER))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= ~letters[j];
           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
-                                        d->states[s].letter, 0))
+                                       d->states[s].context & CTX_LETTER, 0))
             for (j = 0; j < CHARCLASS_INTS; ++j)
               matches[j] &= letters[j];
 
@@ -2480,13 +2561,15 @@ dfastate (int s, struct dfa *d, int trans[])
               copyset(leftovers, labels[ngrps]);
               copyset(intersect, labels[j]);
               MALLOC(grps[ngrps].elems, d->nleaves);
-              copy(&grps[j], &grps[ngrps]);
+              memcpy(grps[ngrps].elems, grps[j].elems,
+                     sizeof (grps[j].elems[0]) * grps[j].nelem);
+              grps[ngrps].nelem = grps[j].nelem;
               ++ngrps;
             }
 
-          /* Put the position in the current group.  Note that there is no
-             reason to call insert() here. */
-          grps[j].elems[grps[j].nelem++] = pos;
+          /* Put the position in the current group.  The constraint is
+             irrelevant here.  */
+          grps[j].elems[grps[j].nelem++] = pos.index;
 
           /* If every character matching the current position has been
              accounted for, we're done. */
@@ -2502,38 +2585,32 @@ dfastate (int s, struct dfa *d, int trans[])
           zeroset(matches);
           MALLOC(grps[ngrps].elems, d->nleaves);
           grps[ngrps].nelem = 1;
-          grps[ngrps].elems[0] = pos;
+          grps[ngrps].elems[0] = pos.index;
           ++ngrps;
         }
     }
 
-  MALLOC(follows.elems, d->nleaves);
-  MALLOC(tmp.elems, d->nleaves);
+  alloc_position_set(&follows, d->nleaves);
+  alloc_position_set(&tmp, d->nleaves);
 
   /* If we are a searching matcher, the default transition is to a state
      containing the positions of state 0, otherwise the default transition
      is to fail miserably. */
   if (d->searchflag)
     {
-      wants_newline = 0;
-      wants_letter = 0;
-      for (i = 0; i < d->states[0].elems.nelem; ++i)
-        {
-          if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
-            wants_newline = 1;
-          if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
-            wants_letter = 1;
-        }
+      /* Find the state(s) corresponding to the positions of state 0. */
       copy(&d->states[0].elems, &follows);
-      state = state_index(d, &follows, 0, 0);
-      if (wants_newline)
-        state_newline = state_index(d, &follows, 1, 0);
+      separate_contexts = state_separate_contexts(&follows, CTX_ANY);
+      state = state_index(d, &follows, 0);
+      if (separate_contexts & CTX_NEWLINE)
+        state_newline = state_index(d, &follows, CTX_NEWLINE);
       else
         state_newline = state;
-      if (wants_letter)
-        state_letter = state_index(d, &follows, 0, 1);
+      if (separate_contexts & CTX_LETTER)
+        state_letter = state_index(d, &follows, CTX_LETTER);
       else
         state_letter = state;
+
       for (i = 0; i < NOTCHAR; ++i)
         trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
       trans[eolbyte] = state_newline;
@@ -2549,8 +2626,8 @@ dfastate (int s, struct dfa *d, int trans[])
       /* Find the union of the follows of the positions of the group.
          This is a hideously inefficient loop.  Fix it someday. */
       for (j = 0; j < grps[i].nelem; ++j)
-        for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
-          insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
+        for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
+          insert(d->follows[grps[i].elems[j]].elems[k], &follows);
 
       if (d->mb_cur_max > 1)
         {
@@ -2592,29 +2669,17 @@ dfastate (int s, struct dfa *d, int trans[])
           insert(d->states[0].elems.elems[j], &follows);
 
       /* Find out if the new state will want any context information. */
-      wants_newline = 0;
-      if (tstbit(eolbyte, labels[i]))
-        for (j = 0; j < follows.nelem; ++j)
-          if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
-            wants_newline = 1;
-
-      wants_letter = 0;
-      for (j = 0; j < CHARCLASS_INTS; ++j)
-        if (labels[i][j] & letters[j])
-          break;
-      if (j < CHARCLASS_INTS)
-        for (j = 0; j < follows.nelem; ++j)
-          if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
-            wants_letter = 1;
+      possible_contexts = charclass_context(labels[i]);
+      separate_contexts = state_separate_contexts(&follows, possible_contexts);
 
       /* Find the state(s) corresponding to the union of the follows. */
-      state = state_index(d, &follows, 0, 0);
-      if (wants_newline)
-        state_newline = state_index(d, &follows, 1, 0);
+      state = state_index(d, &follows, 0);
+      if (separate_contexts & CTX_NEWLINE)
+        state_newline = state_index(d, &follows, CTX_NEWLINE);
       else
         state_newline = state;
-      if (wants_letter)
-        state_letter = state_index(d, &follows, 0, 1);
+      if (separate_contexts & CTX_LETTER)
+        state_letter = state_index(d, &follows, CTX_LETTER);
       else
         state_letter = state;
 
@@ -2674,15 +2739,12 @@ build_state (int s, struct dfa *d)
 
   /* Set up the success bits for this state. */
   d->success[s] = 0;
-  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
-      s, *d))
-    d->success[s] |= 4;
-  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
-      s, *d))
-    d->success[s] |= 2;
-  if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
-      s, *d))
-    d->success[s] |= 1;
+  if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_NEWLINE, s, *d))
+    d->success[s] |= CTX_NEWLINE;
+  if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_LETTER, s, *d))
+    d->success[s] |= CTX_LETTER;
+  if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_NONE, s, *d))
+    d->success[s] |= CTX_NONE;
 
   MALLOC(trans, NOTCHAR);
   dfastate(s, d, trans);
@@ -2842,33 +2904,27 @@ transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
 static int
 match_anychar (struct dfa *d, int s, position pos, int idx)
 {
-  int newline = 0;
-  int letter = 0;
+  int context;
   wchar_t wc;
   int mbclen;
 
   wc = inputwcs[idx];
   mbclen = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
 
-  /* Check context.  */
+  /* Check syntax bits.  */
   if (wc == (wchar_t)eolbyte)
     {
       if (!(syntax_bits & RE_DOT_NEWLINE))
         return 0;
-      newline = 1;
     }
   else if (wc == (wchar_t)'\0')
     {
       if (syntax_bits & RE_DOT_NOT_NULL)
         return 0;
-      newline = 1;
     }
 
-  if (iswalnum(wc) || wc == L'_')
-    letter = 1;
-
-  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
-                           newline, d->states[s].letter, letter))
+  context = wchar_context(wc);
+  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].context, context))
     return 0;
 
   return mbclen;
@@ -2891,29 +2947,25 @@ match_mb_charset (struct dfa *d, int s, position pos, int idx)
   /* Pointer to the structure to which we are currently refering.  */
   struct mb_char_classes *work_mbc;
 
-  int newline = 0;
-  int letter = 0;
+  int context;
   wchar_t wc;		/* Current refering character.  */
 
   wc = inputwcs[idx];
 
-  /* Check context.  */
+  /* Check syntax bits.  */
   if (wc == (wchar_t)eolbyte)
     {
       if (!(syntax_bits & RE_DOT_NEWLINE))
         return 0;
-      newline = 1;
     }
   else if (wc == (wchar_t)'\0')
     {
       if (syntax_bits & RE_DOT_NOT_NULL)
         return 0;
-      newline = 1;
     }
-  if (iswalnum(wc) || wc == L'_')
-    letter = 1;
-  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
-                           newline, d->states[s].letter, letter))
+
+  context = wchar_context(wc);
+  if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].context, context))
     return 0;
 
   /* Assign the current refering operator to work_mbc.  */
@@ -3117,8 +3169,7 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
     }
 
   /* This state has some operators which can match a multibyte character.  */
-  follows.nelem = 0;
-  MALLOC(follows.elems, d->nleaves);
+  alloc_position_set(&follows, d->nleaves);
 
   /* `maxlen' may be longer than the length of a character, because it may
      not be a character but a (multi character) collating element.
@@ -3127,12 +3178,11 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
   transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
 
   wc = inputwcs[*pp - mbclen - buf_begin];
-  s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
+  s1 = state_index(d, &follows, wchar_context (wc));
   realloc_trans_if_necessary(d, s1);
 
   while (*pp - p1 < maxlen)
     {
-      follows.nelem = 0;
       transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
 
       for (i = 0; i < nelem ; i++)
@@ -3145,7 +3195,7 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
         }
 
       wc = inputwcs[*pp - mbclen - buf_begin];
-      s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
+      s1 = state_index(d, &follows, wchar_context (wc));
       realloc_trans_if_necessary(d, s1);
     }
   free(match_lens);
@@ -3210,14 +3260,14 @@ prepare_wc_buf (const char *begin, const char *end)
    points to the beginning of the buffer, and END points to the first byte
    after its end.  Note however that we store a sentinel byte (usually
    newline) in *END, so the actual buffer must be one byte longer.
-   When NEWLINE is nonzero, newlines may appear in the matching string.
+   When ALLOW_NL is nonzero, newlines may appear in the matching string.
    If COUNT is non-NULL, increment *COUNT once for each newline processed.
    Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
    encountered a back-reference (1) or not (0).  The caller may use this
    to decide whether to fall back on a backtracking matcher. */
 char *
 dfaexec (struct dfa *d, char const *begin, char *end,
-         int newline, int *count, int *backref)
+         int allow_nl, int *count, int *backref)
 {
   int s, s1;		/* Current state. */
   unsigned char const *p; /* Current input character. */
@@ -3225,18 +3275,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
                                    into a register. */
   unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
   unsigned char saved_end;
-  static int sbit[NOTCHAR];	/* Table for anding with d->success. */
-  static int sbit_init;
-
-  if (! sbit_init)
-    {
-      unsigned int i;
-
-      sbit_init = 1;
-      for (i = 0; i < NOTCHAR; ++i)
-        sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
-      sbit[eol] = 4;
-    }
 
   if (! d->tralloc)
     build_state_zero(d);
@@ -3360,7 +3398,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
           continue;
         }
 
-      if (p[-1] == eol && newline)
+      if (p[-1] == eol && allow_nl)
         {
           s = d->newlines[s1];
           continue;
@@ -3666,7 +3704,7 @@ enlist (char **cpp, char *new, size_t len)
         cpp[i] = NULL;
       }
   /* Add the new string. */
-  cpp = xnrealloc(cpp, i + 2, sizeof *cpp);
+  REALLOC(cpp, i + 2);
   cpp[i] = new;
   cpp[i + 1] = NULL;
   return cpp;
@@ -3799,7 +3837,7 @@ dfamust (struct dfa *d)
 
   result = empty_string;
   exact = 0;
-  musts = xnmalloc(d->tindex + 1, sizeof *musts);
+  MALLOC (musts, d->tindex + 1);
   mp = musts;
   for (i = 0; i <= d->tindex; ++i)
     mp[i] = must0;
author	Arnold D. Robbins <arnold@skeeve.com>	2012-03-22 22:10:13 +0200
committer	Arnold D. Robbins <arnold@skeeve.com>	2012-03-22 22:10:13 +0200
commit	c7f51459f7da3e0c693f8a950726869d99e5bcf2 (patch)
tree	2e4b54cc695adc5b4fdaec56e237250c030a5419 /dfa.c
parent	1e495ca269aa2653bf804088ebe532b67110a3ef (diff)
parent	b467a6d3d604723e0c152dceb09e998c059bfa40 (diff)
download	egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.tar.gz egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.tar.bz2 egawk-c7f51459f7da3e0c693f8a950726869d99e5bcf2.zip