Merge branch 'master' into comment

author: Arnold D. Robbins <arnold@skeeve.com> 2014-04-11 07:44:22 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2014-04-11 07:44:22 +0300
commit: ebb6772e9eabeb81e3cc9305a6bec7adf7aad450 (patch)
tree: 2cf743f82791db19cc7e31cab86b1fc9a4d5ddbb /dfa.c
parent: e069c636968370f0899d5e4ebaeb9c2341804245 (diff)
parent: a4b59faf911743b30f2e6e979c4f9c1ea0669ac3 (diff)
download: egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.gz
egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.bz2
egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.zip
1 files changed, 309 insertions, 228 deletions
diff --git a/dfa.c b/dfa.c
index 19ca737f..378305df 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,8 +43,15 @@
 #include "missing_d/gawkbool.h"
 #endif /* HAVE_STDBOOL_H */
 
-#include "dfa.h"
-
+/* Gawk doesn't use Gnulib, so don't assume that setlocale and
+   static_assert are present.  */
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
 
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
@@ -84,6 +91,8 @@
 
 #include "xalloc.h"
 
+#include "dfa.h"
+
 #ifdef GAWK
 static int
 is_blank (int c)
@@ -130,7 +139,7 @@ typedef unsigned int charclass[CHARCLASS_INTS];
 /* Convert a possibly-signed character to an unsigned character.  This is
    a bit safer than casting to unsigned char, since it catches some type
    errors that the cast doesn't.  */
-static inline unsigned char
+static unsigned char
 to_uchar (char ch)
 {
   return ch;
@@ -219,7 +228,8 @@ enum
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>; it
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
                                    is not completely handled.  If the scanner
                                    detects a transition on backref, it returns
                                    a kind of "semi-success" indicating that
@@ -402,6 +412,14 @@ struct dfa
   size_t nmultibyte_prop;
   int *multibyte_prop;
 
+#if MBS_SUPPORT
+  /* A table indexed by byte values that contains the corresponding wide
+     character (if any) for that byte.  WEOF means the byte is the
+     leading byte of a multibyte character.  Invalid and null bytes are
+     mapped to themselves.  */
+  wint_t mbrtowc_cache[NOTCHAR];
+#endif
+
   /* Array of the bracket expression in the DFA.  */
   struct mb_char_classes *mbcsets;
   size_t nmbcsets;
@@ -504,6 +522,64 @@ static void regexp (void);
     }								\
   while (false)
 
+static void
+dfambcache (struct dfa *d)
+{
+#if MBS_SUPPORT
+  int i;
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t s = { 0 };
+      wchar_t wc;
+      wint_t wi;
+      switch (mbrtowc (&wc, &c, 1, &s))
+        {
+        default: wi = wc; break;
+        case (size_t) -2: wi = WEOF; break;
+        case (size_t) -1: wi = uc; break;
+        }
+      d->mbrtowc_cache[uc] = wi;
+    }
+#endif
+}
+
+#if MBS_SUPPORT
+/* Given the dfa D, store into *PWC the result of converting the
+   leading bytes of the multibyte buffer S of length N bytes, updating
+   the conversion state in *MBS.  On conversion error, convert just a
+   single byte as-is.  Return the number of bytes converted.
+
+   This differs from mbrtowc (PWC, S, N, MBS) as follows:
+
+   * Extra arg D, containing an mbrtowc_cache for speed.
+   * N must be at least 1.
+   * S[N - 1] must be a sentinel byte.
+   * Shift encodings are not supported.
+   * The return value is always in the range 1..N.
+   * *MBS is always valid afterwards.
+   * *PWC is always set to something.  */
+static size_t
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+              mbstate_t *mbs)
+{
+  unsigned char uc = s[0];
+  wint_t wc = d->mbrtowc_cache[uc];
+
+  if (wc == WEOF)
+    {
+      size_t nbytes = mbrtowc (pwc, s, n, mbs);
+      if (0 < nbytes && nbytes < (size_t) -2)
+        return nbytes;
+      memset (mbs, 0, sizeof *mbs);
+      wc = uc;
+    }
+
+  *pwc = wc;
+  return 1;
+}
+#endif
 
 #ifdef DEBUG
 
@@ -731,67 +807,39 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    this may happen when folding case in weird Turkish locales where
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
-#if MBS_SUPPORT
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
+#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
-   if it is valid in the current character set.  */
-static void
-setbit_c (int b, charclass c)
-{
-  /* Do nothing if b is invalid in this character set.  */
-  if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
-    return;
-  setbit (b, c);
-}
 #else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
   abort ();
    /*NOTREACHED*/ return false;
-}
 #endif
+}
 
-/* Like setbit_c, but if case is folded, set both cases of a letter.  For
-   MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
-   and the caller takes care of setting the appropriate field of struct
-   mb_char_classes.  */
+/* Set a bit for B and its case variants in the charclass C.
+   MB_CUR_MAX must be 1.  */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
-  if (MB_CUR_MAX > 1)
-    {
-      wint_t wc = btowc (b);
-      if (wc == WEOF)
-        return;
-      setbit (b, c);
-      if (case_fold && iswalpha (wc))
-        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
-    }
-  else
-    {
-      setbit (b, c);
-      if (case_fold && isalpha (b))
-        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
-    }
+  int ub = toupper (b);
+  int i;
+  for (i = 0; i < NOTCHAR; i++)
+    if (toupper (i) == ub)
+      setbit (i, c);
 }
 
 
 
 /* UTF-8 encoding allows some optimizations that we can't otherwise
    assume in a multibyte encoding.  */
-static inline int
+int
 using_utf8 (void)
 {
   static int utf8 = -1;
@@ -811,6 +859,46 @@ using_utf8 (void)
   return utf8;
 }
 
+/* Return true if the current locale is known to be a unibyte locale
+   without multicharacter collating sequences and where range
+   comparisons simply use the native encoding.  These locales can be
+   processed more efficiently.  */
+
+static bool
+using_simple_locale (void)
+{
+  /* True if the native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (! native_c_charset || MB_CUR_MAX > 1)
+    return false;
+  else
+    {
+      static int unibyte_c = -1;
+      if (unibyte_c < 0)
+        {
+          char const *locale = setlocale (LC_ALL, NULL);
+          unibyte_c = (!locale
+                       || STREQ (locale, "C")
+                       || STREQ (locale, "POSIX"));
+        }
+      return unibyte_c;
+    }
+}
+
 /* Lexical analyzer.  All the dross that deals with the obnoxious
    GNU Regex syntax bits is located here.  The poor, suffering
    reader is referred to the GNU Regex documentation for the
@@ -827,7 +915,7 @@ static int minrep, maxrep;      /* Repeat counts for {m,n}.  */
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
 /* These variables are used only if (MB_CUR_MAX > 1).  */
-static mbstate_t mbs;           /* Mbstate for mbrlen.  */
+static mbstate_t mbs;           /* mbstate for mbrtowc.  */
 static wchar_t wctok;           /* Wide character representation of the current
                                    multibyte character.  */
 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
@@ -864,32 +952,18 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
     else					\
       {						\
         wchar_t _wc;				\
-        cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
-        if (cur_mb_len <= 0)			\
-          {					\
-            cur_mb_len = 1;			\
-            --lexleft;				\
-            (wc) = (c) = to_uchar (*lexptr++);  \
-          }					\
-        else					\
-          {					\
-            lexptr += cur_mb_len;		\
-            lexleft -= cur_mb_len;		\
-            (wc) = _wc;				\
-            (c) = wctob (wc);			\
-          }					\
+        size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
+        cur_mb_len = nbytes;			\
+        (wc) = _wc;				\
+        (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF;    \
+        lexptr += nbytes;			\
+        lexleft -= nbytes;			\
       }						\
   } while (0)
 
-# define FETCH(c, eoferr)			\
-  do {						\
-    wint_t wc;					\
-    FETCH_WC (c, wc, eoferr);			\
-  } while (0)
-
 #else
 /* Note that characters become unsigned here.  */
-# define FETCH(c, eoferr)	      \
+# define FETCH_WC(c, unused, eoferr)  \
   do {				      \
     if (! lexleft)		      \
       {				      \
@@ -902,14 +976,56 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
     --lexleft;			      \
   } while (0)
 
-# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
-
 #endif /* MBS_SUPPORT */
 
 #ifndef MIN
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+                == CASE_FOLDED_BUFSIZE),
+               "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -928,7 +1044,7 @@ static const struct dfa_ctype prednames[] = {
   {"upper", isupper, false},
   {"lower", islower, false},
   {"digit", isdigit, true},
-  {"xdigit", isxdigit, true},
+  {"xdigit", isxdigit, false},
   {"space", isspace, false},
   {"punct", ispunct, false},
   {"alnum", isalnum, false},
@@ -959,6 +1075,10 @@ parse_bracket_exp (void)
   int c, c1, c2;
   charclass ccl;
 
+  /* True if this is a bracket expression that dfaexec is known to
+     process correctly.  */
+  bool known_bracket_exp = true;
+
   /* Used to warn about [:space:].
      Bit 0 = first character is a colon.
      Bit 1 = last character is a colon.
@@ -1000,6 +1120,7 @@ parse_bracket_exp (void)
     {
       FETCH_WC (c, wc, _("unbalanced ["));
       invert = 1;
+      known_bracket_exp = using_simple_locale ();
     }
   else
     invert = 0;
@@ -1014,16 +1135,14 @@ parse_bracket_exp (void)
          we just treat it as a bunch of ordinary characters.  We can do
          this because we assume regex has checked for syntax errors before
          dfa is ever called.  */
-      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[')
         {
 #define MAX_BRACKET_STRING_LEN 32
           char str[MAX_BRACKET_STRING_LEN + 1];
           FETCH_WC (c1, wc1, _("unbalanced ["));
 
-          /* If pattern contains '[[:', '[[.', or '[[='.  */
-          if (c1 == ':'
-              /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1.  */
-              || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+          if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
+              || c1 == '.' || c1 == '=')
             {
               size_t len = 0;
               for (;;)
@@ -1042,7 +1161,10 @@ parse_bracket_exp (void)
               /* Fetch bracket.  */
               FETCH_WC (c, wc, _("unbalanced ["));
               if (c1 == ':')
-                /* build character class.  */
+                /* Build character class.  POSIX allows character
+                   classes to match multicharacter collating elements,
+                   but the regex code does not support that, so do not
+                   worry about that possibility.  */
                 {
                   char const *class
                     = (case_fold && (STREQ (str, "upper")
@@ -1064,30 +1186,11 @@ parse_bracket_exp (void)
 
                   for (c2 = 0; c2 < NOTCHAR; ++c2)
                     if (pred->func (c2))
-                      setbit_case_fold_c (c2, ccl);
+                      setbit (c2, ccl);
                 }
+              else
+                known_bracket_exp = false;
 
-              else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
-                {
-                  char *elem = xmemdup (str, len + 1);
-
-                  if (c1 == '=')
-                    /* build equivalence class.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->equivs,
-                                            equivs_al, work_mbc->nequivs + 1);
-                      work_mbc->equivs[work_mbc->nequivs++] = elem;
-                    }
-
-                  if (c1 == '.')
-                    /* build collating element.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->coll_elems,
-                                            coll_elems_al,
-                                            work_mbc->ncoll_elems + 1);
-                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
-                    }
-                }
               colon_warning_state |= 8;
 
               /* Fetch new lookahead character.  */
@@ -1109,105 +1212,96 @@ parse_bracket_exp (void)
         /* build range characters.  */
         {
           FETCH_WC (c2, wc2, _("unbalanced ["));
-          if (c2 == ']')
+
+          /* A bracket expression like [a-[.aa.]] matches an unknown set.
+             Treat it like [-a[.aa.]] while parsing it, and
+             remember that the set is unknown.  */
+          if (c2 == '[' && *lexptr == '.')
             {
-              /* In the case [x-], the - is an ordinary hyphen,
-                 which is left in c1, the lookahead character.  */
-              lexptr -= cur_mb_len;
-              lexleft += cur_mb_len;
+              known_bracket_exp = false;
+              c2 = ']';
             }
-        }
 
-      if (c1 == '-' && c2 != ']')
-        {
-          if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-            FETCH_WC (c2, wc2, _("unbalanced ["));
-
-          if (MB_CUR_MAX > 1)
+          if (c2 != ']')
             {
-              /* When case folding map a range, say [m-z] (or even [M-z])
-                 to the pair of ranges, [m-z] [M-Z].  */
-              REALLOC_IF_NECESSARY (work_mbc->range_sts,
-                                    range_sts_al, work_mbc->nranges + 1);
-              REALLOC_IF_NECESSARY (work_mbc->range_ends,
-                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] =
-                case_fold ? towlower (wc) : (wchar_t) wc;
-              work_mbc->range_ends[work_mbc->nranges++] =
-                case_fold ? towlower (wc2) : (wchar_t) wc2;
-
-              if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+              if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+                FETCH_WC (c2, wc2, _("unbalanced ["));
+
+              if (MB_CUR_MAX > 1)
                 {
+                  /* When case folding map a range, say [m-z] (or even [M-z])
+                     to the pair of ranges, [m-z] [M-Z].  Although this code
+                     is wrong in multiple ways, it's never used in practice.
+                     FIXME: Remove this (and related) unused code.  */
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                         range_sts_al, work_mbc->nranges + 1);
-                  work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
                   REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                         range_ends_al, work_mbc->nranges + 1);
-                  work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
-                }
-            }
-          else
-            {
-#ifdef GAWK
-              c1 = c;
-              if (case_fold)
-                {
-                  c1 = tolower (c1);
-                  c2 = tolower (c2);
+                  work_mbc->range_sts[work_mbc->nranges] =
+                    case_fold ? towlower (wc) : (wchar_t) wc;
+                  work_mbc->range_ends[work_mbc->nranges++] =
+                    case_fold ? towlower (wc2) : (wchar_t) wc2;
+
+                  if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+                    {
+                      REALLOC_IF_NECESSARY (work_mbc->range_sts,
+                                            range_sts_al, work_mbc->nranges + 1);
+                      work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+                      REALLOC_IF_NECESSARY (work_mbc->range_ends,
+                                            range_ends_al, work_mbc->nranges + 1);
+                      work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                    }
                 }
-              for (c = c1; c <= c2; c++)
-                setbit_case_fold_c (c, ccl);
-#else
-              /* Defer to the system regex library about the meaning
-                 of range expressions.  */
-              regex_t re;
-              char pattern[6] = { '[', 0, '-', 0, ']', 0 };
-              char subject[2] = { 0, 0 };
-              c1 = c;
-              if (case_fold)
+              else if (using_simple_locale ())
                 {
-                  c1 = tolower (c1);
-                  c2 = tolower (c2);
+                  for (c1 = c; c1 <= c2; c1++)
+                    setbit (c1, ccl);
+                  if (case_fold)
+                    {
+                      int uc = toupper (c);
+                      int uc2 = toupper (c2);
+                      for (c1 = 0; c1 < NOTCHAR; c1++)
+                        {
+                          int uc1 = toupper (c1);
+                          if (uc <= uc1 && uc1 <= uc2)
+                            setbit (c1, ccl);
+                        }
+                    }
                 }
+              else
+                known_bracket_exp = false;
 
-              pattern[1] = c1;
-              pattern[3] = c2;
-              regcomp (&re, pattern, REG_NOSUB);
-              for (c = 0; c < NOTCHAR; ++c)
-                {
-                  if ((case_fold && isupper (c)))
-                    continue;
-                  subject[0] = c;
-                  if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
-                    setbit_case_fold_c (c, ccl);
-                }
-              regfree (&re);
-#endif
+              colon_warning_state |= 8;
+              FETCH_WC (c1, wc1, _("unbalanced ["));
+              continue;
             }
 
-          colon_warning_state |= 8;
-          FETCH_WC (c1, wc1, _("unbalanced ["));
-          continue;
+          /* In the case [x-], the - is an ordinary hyphen,
+             which is left in c1, the lookahead character.  */
+          lexptr -= cur_mb_len;
+          lexleft += cur_mb_len;
         }
 
       colon_warning_state |= (c == ':') ? 2 : 4;
 
       if (MB_CUR_MAX == 1)
         {
-          setbit_case_fold_c (c, ccl);
+          if (case_fold)
+            setbit_case_fold_c (c, ccl);
+          else
+            setbit (c, ccl);
           continue;
         }
 
-      if (case_fold && iswalpha (wc))
+      if (case_fold)
         {
-          wc = towlower (wc);
-          if (!setbit_wc (wc, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = wc;
-            }
-          wc = towupper (wc);
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wc, folded);
+          REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                work_mbc->nchars + n);
+          for (i = 0; i < n; i++)
+            if (!setbit_wc (folded[i], ccl))
+              work_mbc->chars[work_mbc->nchars++] = folded[i];
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1221,6 +1315,9 @@ parse_bracket_exp (void)
   if (colon_warning_state == 7)
     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
 
+  if (! known_bracket_exp)
+    return BACKREF;
+
   if (MB_CUR_MAX > 1)
     {
       static charclass zeroclass;
@@ -1256,14 +1353,9 @@ lex (void)
      "if (backslash) ...".  */
   for (i = 0; i < 2; ++i)
     {
-      if (MB_CUR_MAX > 1)
-        {
-          FETCH_WC (c, wctok, NULL);
-          if ((int) c == EOF)
-            goto normal_char;
-        }
-      else
-        FETCH (c, NULL);
+      FETCH_WC (c, wctok, NULL);
+      if (c == (unsigned int) EOF)
+        goto normal_char;
 
       switch (c)
         {
@@ -1638,10 +1730,11 @@ addtok (token t)
           work_mbc->nchars = 0;
         }
 
-      /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
+      /* If the MBCSET is non-inverted and includes neither
+         character classes including multibyte characters, range
+         expressions, equivalence classes nor collating elements,
+         it can be replaced by a simple CSET.  */
       if (work_mbc->invert
-          || (!using_utf8 () && work_mbc->cset != -1)
-          || work_mbc->nchars != 0
           || work_mbc->nch_classes != 0
           || work_mbc->nranges != 0
           || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
@@ -1656,7 +1749,6 @@ addtok (token t)
              that the mbcset is empty now.  Do nothing in that case.  */
           if (work_mbc->cset != -1)
             {
-              assert (using_utf8 ());
               addtok (CSET + work_mbc->cset);
               if (need_or)
                 addtok (OR);
@@ -1680,16 +1772,19 @@ static void
 addtok_wc (wint_t wc)
 {
   unsigned char buf[MB_LEN_MAX];
-  mbstate_t s;
+  mbstate_t s = { 0 };
   int i;
-  memset (&s, 0, sizeof s);
-  cur_mb_len = wcrtomb ((char *) buf, wc, &s);
+  size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
 
-  /* This is merely stop-gap.  When cur_mb_len is 0 or negative,
-     buf[0] is undefined, yet skipping the addtok_mb call altogether
-     can result in heap corruption.  */
-  if (cur_mb_len <= 0)
-    buf[0] = 0;
+  if (stored_bytes != (size_t) -1)
+    cur_mb_len = stored_bytes;
+  else
+    {
+      /* This is merely a stop-gap.  buf[0] is undefined, yet skipping
+         the addtok_mb call altogether can corrupt the heap.  */
+      cur_mb_len = 1;
+      buf[0] = 0;
+    }
 
   addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
   for (i = 1; i < cur_mb_len; i++)
@@ -1794,17 +1889,19 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (0)
-    {
-      /* empty */
-    }
-  else if (MBS_SUPPORT && tok == WCHAR)
+  if (MBS_SUPPORT && tok == WCHAR)
     {
-      addtok_wc (case_fold ? towlower (wctok) : wctok);
-      if (case_fold && iswalpha (wctok))
+      addtok_wc (wctok);
+
+      if (case_fold)
         {
-          addtok_wc (towupper (wctok));
-          addtok (OR);
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wctok, folded);
+          for (i = 0; i < n; i++)
+            {
+              addtok_wc (folded[i]);
+              addtok (OR);
+            }
         }
 
       tok = lex ();
@@ -3308,43 +3405,26 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
 /* Initialize mblen_buf and inputwcs with data from the next line.  */
 
 static void
-prepare_wc_buf (const char *begin, const char *end)
+prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
 {
 #if MBS_SUPPORT
   unsigned char eol = eolbyte;
-  size_t remain_bytes, i;
+  size_t i;
+  size_t ilim = end - begin + 1;
 
   buf_begin = (unsigned char *) begin;
 
-  remain_bytes = 0;
-  for (i = 0; i < end - begin + 1; i++)
+  for (i = 0; i < ilim; i++)
     {
-      if (remain_bytes == 0)
-        {
-          remain_bytes
-            = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-          if (remain_bytes < 1
-              || remain_bytes == (size_t) -1
-              || remain_bytes == (size_t) -2
-              || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
-            {
-              remain_bytes = 0;
-              inputwcs[i] = (wchar_t) begin[i];
-              mblen_buf[i] = 0;
-              if (begin[i] == eol)
-                break;
-            }
-          else
-            {
-              mblen_buf[i] = remain_bytes;
-              remain_bytes--;
-            }
-        }
-      else
+      size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs);
+      mblen_buf[i] = nbytes - (nbytes == 1);
+      if (begin[i] == eol)
+        break;
+      while (--nbytes != 0)
         {
-          mblen_buf[i] = remain_bytes;
+          i++;
+          mblen_buf[i] = nbytes;
           inputwcs[i] = 0;
-          remain_bytes--;
         }
     }
 
@@ -3391,7 +3471,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       MALLOC (mblen_buf, end - begin + 2);
       MALLOC (inputwcs, end - begin + 2);
       memset (&mbs, 0, sizeof (mbstate_t));
-      prepare_wc_buf ((const char *) p, end);
+      prepare_wc_buf (d, (const char *) p, end);
     }
 
   for (;;)
@@ -3481,7 +3561,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
             ++*count;
 
           if (d->mb_cur_max > 1)
-            prepare_wc_buf ((const char *) p, end);
+            prepare_wc_buf (d, (const char *) p, end);
         }
 
       /* Check if we've run off the end of the buffer.  */
@@ -3600,6 +3680,7 @@ void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
   dfainit (d);
+  dfambcache (d);
   dfaparse (s, len, d);
   dfamust (d);
   dfaoptimize (d);
author	Arnold D. Robbins <arnold@skeeve.com>	2014-04-11 07:44:22 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2014-04-11 07:44:22 +0300
commit	ebb6772e9eabeb81e3cc9305a6bec7adf7aad450 (patch)
tree	2cf743f82791db19cc7e31cab86b1fc9a4d5ddbb /dfa.c
parent	e069c636968370f0899d5e4ebaeb9c2341804245 (diff)
parent	a4b59faf911743b30f2e6e979c4f9c1ea0669ac3 (diff)
download	egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.gz egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.tar.bz2 egawk-ebb6772e9eabeb81e3cc9305a6bec7adf7aad450.zip