11 files changed, 622 insertions, 216 deletions
diff --git a/ChangeLog b/ChangeLog
index 023fa884..d3fcbdd3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,20 @@
-2016-08-29         Aharon Robbins       <aharon.robbins@intel.com>
+2016-09-01         Arnold D. Robbins     <arnold@skeeve.com>
+
+	Merge grep's now thread-safe dfa. Wheee.
+
+	* dfa.h, dfa.c: Sync with grep.
+	* localeinfo.h, localeinfo.c, verify.h: New files.
+	* Makefile.am (base_sources): Adjust.
+	* awk.h (using_utf8): Declare new function.
+	* node.c (str2wstr): Use using_utf8 instead of now-gone dfa function.
+	* re.c: Include "localeinfo.h".
+	(localeinfo): New static variable.
+	(make_regexp): Adjust call to dfa_syntax.
+	(resetup): Call init_localeinfo on localeinfo. Remove call to
+	now-gone function dfa_init.
+	(using_utf8): New function.
+
+2016-08-29         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* configure.ac (fwrite_unlocked): Check for it.
 	* awk.h (fwrite): Define to fwrite_unlocked if we have it.
diff --git a/Makefile.am b/Makefile.am
index dce65018..9acae0bc 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -109,6 +109,8 @@ base_sources = \
 	gettext.h \
 	int_array.c \
 	interpret.h \
+	localeinfo.c \
+	localeinfo.h \
 	io.c \
 	mbsupport.h \
 	main.c \
@@ -126,6 +128,7 @@ base_sources = \
 	replace.c \
 	str_array.c \
 	symbol.c \
+	verify.h \
 	version.c \
 	xalloc.h
 
diff --git a/Makefile.in b/Makefile.in
index 036361cb..f103a420 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -143,10 +143,11 @@ am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \
 	dfa.$(OBJEXT) eval.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \
 	floatcomp.$(OBJEXT) gawkapi.$(OBJEXT) gawkmisc.$(OBJEXT) \
 	getopt.$(OBJEXT) getopt1.$(OBJEXT) int_array.$(OBJEXT) \
-	io.$(OBJEXT) main.$(OBJEXT) mpfr.$(OBJEXT) msg.$(OBJEXT) \
-	node.$(OBJEXT) profile.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) \
-	regex.$(OBJEXT) replace.$(OBJEXT) str_array.$(OBJEXT) \
-	symbol.$(OBJEXT) version.$(OBJEXT)
+	localeinfo.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) \
+	mpfr.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) profile.$(OBJEXT) \
+	random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \
+	replace.$(OBJEXT) str_array.$(OBJEXT) symbol.$(OBJEXT) \
+	version.$(OBJEXT)
 am_gawk_OBJECTS = $(am__objects_1)
 gawk_OBJECTS = $(am_gawk_OBJECTS)
 gawk_LDADD = $(LDADD)
@@ -518,6 +519,8 @@ base_sources = \
 	gettext.h \
 	int_array.c \
 	interpret.h \
+	localeinfo.c \
+	localeinfo.h \
 	io.c \
 	mbsupport.h \
 	main.c \
@@ -535,6 +538,7 @@ base_sources = \
 	replace.c \
 	str_array.c \
 	symbol.c \
+	verify.h \
 	version.c \
 	xalloc.h
 
@@ -681,6 +685,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt1.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/int_array.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/localeinfo.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpfr.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@
diff --git a/awk.h b/awk.h
index edd9cb95..2c401637 100644
--- a/awk.h
+++ b/awk.h
@@ -1656,6 +1656,7 @@ extern void resyntax(int syntax);
 extern void resetup(void);
 extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
 extern int get_numbase(const char *str, bool use_locale);
+extern bool using_utf8(void);
 
 /* symbol.c */
 extern void load_symbols();
diff --git a/dfa.c b/dfa.c
index 85cb46ad..fad03e4f 100644
--- a/dfa.c
+++ b/dfa.c
@@ -69,6 +69,8 @@
 
 #include "dfa.h"
 
+#include "localeinfo.h"
+
 #ifdef GAWK
 static int
 is_blank (int c)
@@ -445,14 +447,9 @@ struct dfa
   size_t nregexps;              /* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;			/* The DFA is fast.  */
-  bool multibyte;		/* MB_CUR_MAX > 1.  */
   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;		/* Multibyte conversion state.  */
 
-  /* dfaexec implementation.  */
-  char *(*dfaexec) (struct dfa *, char const *, char *,
-                    bool, size_t *, bool *);
-
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
   /* The value of multibyte_prop[i] is defined by following rule.
@@ -538,6 +535,21 @@ struct dfa
   state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
   state_num mb_trcount;         /* Number of transition tables for states with
                                    ANYCHAR that have actually been built.  */
+
+  /* Information derived from the locale.  This is at the end so that
+     a quick memset need not clear it specially.  */
+
+  /* dfaexec implementation.  */
+  char *(*dfaexec) (struct dfa *, char const *, char *,
+                    bool, size_t *, bool *);
+
+  /* The locale is simple, like the C locale.  These locales can be
+     processed more efficiently, e.g., the relationship between lower-
+     and upper-case letters is 1-1.  */
+  bool simple_locale;
+
+  /* Other cached information derived from the locale.  */
+  struct localeinfo localeinfo;
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -551,13 +563,8 @@ struct dfa
 
 static void regexp (struct dfa *dfa);
 
-/* A table indexed by byte values that contains the corresponding wide
-   character (if any) for that byte.  WEOF means the byte is not a
-   valid single-byte character.  */
-static wint_t mbrtowc_cache[NOTCHAR];
-
 /* Store into *PWC the result of converting the leading bytes of the
-   multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+   multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
    and updating the conversion state in *D.  On conversion error,
    convert just a single byte, to WEOF.  Return the number of bytes
    converted.
@@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
 
    * PWC points to wint_t, not to wchar_t.
    * The last arg is a dfa *D instead of merely a multibyte conversion
-     state D->mbs.  D also contains an mbrtowc_cache for speed.
+     state D->mbs.
    * N must be at least 1.
    * S[N - 1] must be a sentinel byte.
    * Shift encodings are not supported.
@@ -577,7 +584,7 @@ static size_t
 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
-  wint_t wc = mbrtowc_cache[uc];
+  wint_t wc = d->localeinfo.sbctowc[uc];
 
   if (wc == WEOF)
     {
@@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
 {
   size_t i;
 
@@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
 }
 
 static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 {
-  return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+  return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 }
 
 static int
@@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c)
 {
   if (c == dfa->syntax.eolbyte)
     return CTX_NEWLINE;
-  if (unibyte_word_constituent (c))
+  if (unibyte_word_constituent (dfa, c))
     return CTX_LETTER;
   return CTX_NONE;
 }
 
-/* UTF-8 encoding allows some optimizations that we can't otherwise
-   assume in a multibyte encoding.  */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
-  return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
-  int i;
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t s = { 0 };
-      wchar_t wc;
-      mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-    }
-}
-
-/* Entry point to set syntax options.  */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
-  int i;
-  dfa->syntax.syntax_bits_set = true;
-  dfa->syntax.syntax_bits = bits;
-  dfa->syntax.case_fold = fold;
-  dfa->syntax.eolbyte = eol;
-
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      unsigned char uc = i;
-
-      /* Use mbrtowc_cache to calculate sbit.  */
-      dfa->syntax.sbit[uc] = char_context (dfa, uc);
-      switch (dfa->syntax.sbit[uc])
-        {
-        case CTX_LETTER:
-          setbit (uc, dfa->syntax.letters);
-          break;
-        case CTX_NEWLINE:
-          setbit (uc, dfa->syntax.newline);
-          break;
-        }
-
-      /* POSIX requires that the five bytes in "\n\r./" (including the
-         terminating NUL) cannot occur inside a multibyte character.  */
-      dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
-                                     : strchr ("\n\r./", uc) != NULL);
-    }
-}
-
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
    this may happen when folding case in weird Turkish locales where
@@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c)
       setbit (i, c);
 }
 
-static void check_utf8 (void)
-{
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
-  char const *locale = setlocale (LC_ALL, NULL);
-  unibyte_c = (!locale
-               || STREQ (locale, "C")
-               || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
-   without multicharacter collating sequences and where range
-   comparisons simply use the native encoding.  These locales can be
-   processed more efficiently.  */
+/* Return true if the locale is compatible with the C locale.  */
 
 static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
 {
   /* The native character set is known to be compatible with
      the C locale.  The following test isn't perfect, but it's good
@@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa)
      && '}' == 125 && '~' == 126)
   };
 
-  return (native_c_charset & !dfa->multibyte) | unibyte_c;
+  if (native_c_charset && !multibyte)
+    return true;
+  else
+    {
+      /* Treat C and POSIX locales as being compatible.  Also, treat
+         errors as compatible, as these are invariably from stubs.  */
+      char const *loc = setlocale (LC_ALL, NULL);
+      return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+    }
 }
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
@@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa)
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE,
-  };
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER.  */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
-   itself, and store them into FOLDED.  Return the number of characters
-   stored.  */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
-  unsigned int i;
-  unsigned int n = 0;
-  wint_t uc = towupper (c);
-  wint_t lc = towlower (uc);
-  if (uc != c)
-    folded[n++] = uc;
-  if (lc != uc && lc != c && towupper (lc) == uc)
-    folded[n++] = lc;
-  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
-    {
-      wint_t li = lonesome_lower[i];
-      if (li != lc && li != uc && li != c && towupper (li) == uc)
-        folded[n++] = li;
-    }
-  return n;
-}
-
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa)
   size_t chars_al;
 
   chars_al = 0;
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
                                     &dfa->mbcsets_alloc,
@@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa)
     {
       FETCH_WC (dfa, c, wc, _("unbalanced ["));
       invert = true;
-      known_bracket_exp = using_simple_locale (dfa);
+      known_bracket_exp = dfa->simple_locale;
     }
   else
     invert = false;
@@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa)
                   if (!pred)
                     dfaerror (_("invalid character class"));
 
-                  if (dfa->multibyte && !pred->single_byte_only)
+                  if (dfa->localeinfo.multibyte && !pred->single_byte_only)
                     known_bracket_exp = false;
                   else
                     for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->multibyte)
+                  if (dfa->localeinfo.multibyte)
                     known_bracket_exp = false;
-                  else if (using_simple_locale (dfa))
+                  else if (dfa->simple_locale)
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
@@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       colon_warning_state |= (c == ':') ? 2 : 4;
 
-      if (!dfa->multibyte)
+      if (!dfa->localeinfo.multibyte)
         {
           if (dfa->syntax.case_fold)
             setbit_case_fold_c (c, ccl);
@@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
       return MBCSET;
     }
 
   if (invert)
     {
-      assert (!dfa->multibyte);
+      assert (!dfa->localeinfo.multibyte);
       notset (ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', ccl);
     }
 
-  return CSET + dfa_charclass_index (dfa, ccl);
+  return CSET + charclass_index (dfa, ccl);
 }
 
 struct lexptr
@@ -1535,7 +1426,7 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             {
               /* In multibyte environment period must match with a single
                  character not a byte.  So we use ANYCHAR.  */
@@ -1549,13 +1440,13 @@ lex (struct dfa *dfa)
           if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
 
         case 's':
         case 'S':
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1564,7 +1455,7 @@ lex (struct dfa *dfa)
               if (c == 'S')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1588,16 +1479,16 @@ lex (struct dfa *dfa)
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
 
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (unibyte_word_constituent (dfa, c2))
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1627,14 +1518,14 @@ lex (struct dfa *dfa)
           dfa->lex.laststart = false;
           /* For multibyte character sets, folding is done in atom.  Always
              return WCHAR.  */
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             return dfa->lex.lasttok = WCHAR;
 
           if (dfa->syntax.case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           return dfa->lex.lasttok = c;
@@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
     {
       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
                                 sizeof *dfa->tokens);
-      if (dfa->multibyte)
+      if (dfa->localeinfo.multibyte)
         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
                                          sizeof *dfa->multibyte_prop);
     }
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     dfa->multibyte_prop[dfa->tindex] = mbprop;
   dfa->tokens[dfa->tindex++] = t;
 
@@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
 static void
 addtok (struct dfa *dfa, token t)
 {
-  if (dfa->multibyte && t == MBCSET)
+  if (dfa->localeinfo.multibyte && t == MBCSET)
     {
       bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa)
             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
       }
 
   /* A valid UTF-8 character is
@@ -1878,7 +1769,7 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && using_utf8)
+  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
 {
   size_t i;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     for (i = 0; i < ntokens; ++i)
       addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
   else
@@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   d->lex.lasttok = END;
   d->lex.laststart = true;
   d->lex.parens = 0;
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     {
       d->lex.cur_mb_len = 0;
       memset (&d->mbs, 0, sizeof d->mbs);
@@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
         {
           /* ANYCHAR must match a single character, so put it to
              D->states[s].mbps which contains the positions which can
@@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+        trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
       trans[d->syntax.eolbyte] = state_newline;
     }
   else
@@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
           insert (d->follows[grps[i].elems[j]].elems[k], &follows);
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           /* If a token in follows.elems is not 1st byte of a multibyte
              character, or the states of follows must accept the bytes
@@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
-      if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+      if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
         {
           merge (&d->states[0].elems, &follows, &tmp);
           copy (&tmp, &follows);
@@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
               if (c == d->syntax.eolbyte)
                 trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
+              else if (unibyte_word_constituent (d, c))
                 trans[c] = state_letter;
               else if (c < NOTCHAR)
                 trans[c] = state;
@@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
         {
           d->trans[oldalloc] = NULL;
           d->fails[oldalloc] = NULL;
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             d->mb_trans[oldalloc] = NULL;
         }
     }
@@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d)
         }
       d->trcount = d->min_trcount;
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           for (i = d->min_trcount; i < d->tralloc; i++)
             {
@@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
   return (char *) begin;
 }
 
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
    but faster and set *BACKREF if the DFA code does not support this
    regexp usage.  */
 
@@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (!d->multibyte)
+          if (!d->localeinfo.multibyte)
             continue;
           /* fallthrough */
 
@@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d)
   size_t i;
   bool have_backref = false;
 
-  if (!using_utf8)
+  if (!d->localeinfo.using_utf8)
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d)
     }
 
   free_mbdata (d);
-  d->multibyte = false;
+  d->localeinfo.multibyte = false;
   d->dfaexec = dfaexec_sb;
   d->fast = true;
 }
@@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d)
   struct dfa *sup = dfaalloc ();
 
   *sup = *d;
-  sup->multibyte = false;
+  sup->localeinfo.multibyte = false;
   sup->dfaexec = dfaexec_sb;
   sup->multibyte_prop = NULL;
   sup->mbcsets = NULL;
@@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d)
         case BACKREF:
           zeroset (ccl);
           notset (ccl);
-          sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+          sup->tokens[j++] = CSET + charclass_index (sup, ccl);
           sup->tokens[j++] = STAR;
           if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
               || d->tokens[i + 1] == PLUS)
@@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
                  Ignore them in the superset DFA.  */
@@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d)
     }
   sup->tindex = j;
 
-  if (have_nchar && (have_achar || d->multibyte))
+  if (have_nchar && (have_achar || d->localeinfo.multibyte))
     d->superset = sup;
   else
     {
@@ -3705,7 +3596,7 @@ dfafree (struct dfa *d)
   free (d->charclasses);
   free (d->tokens);
 
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     free_mbdata (d);
 
   for (i = 0; i < d->sindex; ++i)
@@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm)
 struct dfa *
 dfaalloc (void)
 {
-  struct dfa *d = xzalloc (sizeof *d);
-  d->multibyte = MB_CUR_MAX > 1;
-  d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
-  d->fast = !d->multibyte;
-  d->lex.cur_mb_len = 1;
-  return d;
+  return xmalloc (sizeof (struct dfa));
 }
 
+/* Initialize DFA.  */
 void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+           reg_syntax_t bits, bool fold, unsigned char eol)
 {
-  check_utf8 ();
-  check_unibyte_c ();
-  init_mbrtowc_cache ();
+  int i;
+  memset (dfa, 0, offsetof (struct dfa, dfaexec));
+  dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+  dfa->simple_locale = using_simple_locale (linfo->multibyte);
+  dfa->localeinfo = *linfo;
+
+  dfa->fast = !dfa->localeinfo.multibyte;
+
+  dfa->lex.cur_mb_len = 1;
+  dfa->syntax.syntax_bits_set = true;
+  dfa->syntax.syntax_bits = bits;
+  dfa->syntax.case_fold = fold;
+  dfa->syntax.eolbyte = eol;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      unsigned char uc = i;
+
+      dfa->syntax.sbit[uc] = char_context (dfa, uc);
+      switch (dfa->syntax.sbit[uc])
+        {
+        case CTX_LETTER:
+          setbit (uc, dfa->syntax.letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (uc, dfa->syntax.newline);
+          break;
+        }
+
+      /* POSIX requires that the five bytes in "\n\r./" (including the
+         terminating NUL) cannot occur inside a multibyte character.  */
+      dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+                                     ? (uc & 0xc0) != 0x80
+                                     : strchr ("\n\r./", uc) != NULL);
+    }
 }
 
 /* vim:set shiftwidth=2: */
diff --git a/dfa.h b/dfa.h
index 02f56f44..1fd37ec9 100644
--- a/dfa.h
+++ b/dfa.h
@@ -28,6 +28,8 @@
 
 #define _GL_ATTRIBUTE_MALLOC
 
+struct localeinfo; /* See localeinfo.h.  */
+
 /* Element of a list of strings, at least one of which is known to
    appear in any R.E. matching the DFA. */
 struct dfamust
@@ -48,17 +50,22 @@ struct dfa;
    calling dfafree() on it. */
 extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
 
+/* Initialize or reinitialize a DFA.  This must be called before
+   any of the routines below.  The arguments are:
+   1. The DFA to operate on.
+   2. Information about the current locale.
+   3. The syntax bits described earlier in this file.
+   4. The case-folding flag.
+   5. The line terminator.  */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+                       reg_syntax_t, bool, unsigned char);
+
 /* Build and return the struct dfamust from the given struct dfa. */
 extern struct dfamust *dfamust (struct dfa const *);
 
 /* Free the storage held by the components of a struct dfamust. */
 extern void dfamustfree (struct dfamust *);
 
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
-   second sets the syntax bits described earlier in this file, the third sets
-   the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
 /* Compile the given string of the given length into the given struct dfa.
    Final argument is a flag specifying whether to build a searching or an
    exact matcher. */
@@ -103,8 +110,3 @@ extern void dfawarn (const char *);
    takes a single argument, a NUL-terminated string describing the error.
    The user must supply a dfaerror.  */
 extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/localeinfo.c b/localeinfo.c
new file mode 100644
index 00000000..ca96afc7
--- /dev/null
+++ b/localeinfo.c
@@ -0,0 +1,113 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wctype.h>
+
+/* The sbclen implementation relies on this.  */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8.  */
+
+static bool
+is_using_utf8 (void)
+{
+  wchar_t wc;
+  mbstate_t mbs = {0};
+  return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale.  */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+  int i;
+
+  localeinfo->multibyte = MB_CUR_MAX > 1;
+  localeinfo->using_utf8 = is_using_utf8 ();
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t s = {0};
+      wchar_t wc;
+      size_t len = mbrtowc (&wc, &c, 1, &s);
+      localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+      localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+    }
+}
+
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+/* Verify that the worst case fits.  This is 1 for towupper, 1 for
+   towlower, and 1 for each entry in LONESOME_LOWER.  */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+        <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
diff --git a/localeinfo.h b/localeinfo.h
new file mode 100644
index 00000000..cf2f9a69
--- /dev/null
+++ b/localeinfo.h
@@ -0,0 +1,54 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo /* Locale-dependent data; see init_localeinfo.  */
+{
+  /* True if MB_CUR_MAX > 1.  */
+  bool multibyte;
+
+  /* True if the locale uses UTF-8.  */
+  bool using_utf8;
+
+  /* An array indexed by byte values B that contains 1 if B is a
+     single-byte character, -1 if B is an encoding error, and -2 if B
+     is the leading byte of a multibyte character that contains more
+     than one byte.  */
+  signed char sbclen[UCHAR_MAX + 1];
+
+  /* An array indexed by byte values B that contains the corresponding
+     wide character (if any) for B if sbclen[B] == 1.  WEOF means the
+     byte is not a valid single-byte character, i.e., sbclen[B] == -1
+     or -2.  */
+  wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is a generous upper bound.  */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/node.c b/node.c
index c6c9af81..bb2fe437 100644
--- a/node.c
+++ b/node.c
@@ -752,7 +752,7 @@ str2wstr(NODE *n, size_t **ptr)
 			 * stopping early. This is particularly important
 			 * for match() where we need to build the indices.
 			 */
-			if (dfa_using_utf8()) {
+			if (using_utf8()) {
 				count = 1;
 				wc = 0xFFFD;	/* unicode replacement character */
 				goto set_wc;
diff --git a/re.c b/re.c
index c822c90f..6a100db0 100644
--- a/re.c
+++ b/re.c
@@ -25,10 +25,14 @@
 
 #include "awk.h"
 
+#include "localeinfo.h"
+
 static reg_syntax_t syn;
 static void check_bracket_exp(char *s, size_t len);
 const char *regexflags2str(int flags);
 
+static struct localeinfo localeinfo;
+
 /* make_regexp --- generate compiled regular expressions */
 
 Regexp *
@@ -223,7 +227,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
 	if (dfa && ! no_dfa) {
 		rp->dfareg = dfaalloc();
-		dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
+		dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n');
 		dfacomp(buf, len, rp->dfareg, true);
 	} else
 		rp->dfareg = NULL;
@@ -395,6 +399,9 @@ re_update(NODE *t)
 void
 resetup()
 {
+	// init localeinfo for dfa
+	init_localeinfo(& localeinfo);
+
 	/*
 	 * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
 	 * are helpful for recovering from the experience.
@@ -418,8 +425,14 @@ resetup()
 		syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
 
 	(void) re_set_syntax(syn);
+}
+
+/* using_utf8 --- return true if the locale uses UTF-8 */
 
-	dfa_init();
+bool
+using_utf8(void)
+{
+	return localeinfo.using_utf8;
 }
 
 /* reisstring --- return true if the RE match is a simple string match */
diff --git a/verify.h b/verify.h
new file mode 100644
index 00000000..5c8381d2
--- /dev/null
+++ b/verify.h
@@ -0,0 +1,279 @@
+/* Compile-time assert-like macros.
+
+   Copyright (C) 2005-2006, 2009-2016 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Paul Eggert, Bruno Haible, and Jim Meyering.  */
+
+#ifndef _GL_VERIFY_H
+#define _GL_VERIFY_H
+
+
+/* Define _GL_HAVE__STATIC_ASSERT to 1 if _Static_assert works as per C11.
+   This is supported by GCC 4.6.0 and later, in C mode, and its use
+   here generates easier-to-read diagnostics when verify (R) fails.
+
+   Define _GL_HAVE_STATIC_ASSERT to 1 if static_assert works as per C++11.
+   This will likely be supported by future GCC versions, in C++ mode.
+
+   Use this only with GCC.  If we were willing to slow 'configure'
+   down we could also use it with other compilers, but since this
+   affects only the quality of diagnostics, why bother?  */
+#if (4 < __GNUC__ + (6 <= __GNUC_MINOR__) \
+     && (201112L <= __STDC_VERSION__  || !defined __STRICT_ANSI__) \
+     && !defined __cplusplus)
+# define _GL_HAVE__STATIC_ASSERT 1
+#endif
+/* The condition (99 < __GNUC__) is temporary, until we know about the
+   first G++ release that supports static_assert.  */
+#if (99 < __GNUC__) && defined __cplusplus
+# define _GL_HAVE_STATIC_ASSERT 1
+#endif
+
+/* FreeBSD 9.1 <sys/cdefs.h>, included by <stddef.h> and lots of other
+   system headers, defines a conflicting _Static_assert that is no
+   better than ours; override it.  */
+#ifndef _GL_HAVE_STATIC_ASSERT
+# include <stddef.h>
+# undef _Static_assert
+#endif
+
+/* Each of these macros verifies that its argument R is nonzero.  To
+   be portable, R should be an integer constant expression.  Unlike
+   assert (R), there is no run-time overhead.
+
+   If _Static_assert works, verify (R) uses it directly.  Similarly,
+   _GL_VERIFY_TRUE works by packaging a _Static_assert inside a struct
+   that is an operand of sizeof.
+
+   The code below uses several ideas for C++ compilers, and for C
+   compilers that do not support _Static_assert:
+
+   * The first step is ((R) ? 1 : -1).  Given an expression R, of
+     integral or boolean or floating-point type, this yields an
+     expression of integral type, whose value is later verified to be
+     constant and nonnegative.
+
+   * Next this expression W is wrapped in a type
+     struct _gl_verify_type {
+       unsigned int _gl_verify_error_if_negative: W;
+     }.
+     If W is negative, this yields a compile-time error.  No compiler can
+     deal with a bit-field of negative size.
+
+     One might think that an array size check would have the same
+     effect, that is, that the type struct { unsigned int dummy[W]; }
+     would work as well.  However, inside a function, some compilers
+     (such as C++ compilers and GNU C) allow local parameters and
+     variables inside array size expressions.  With these compilers,
+     an array size check would not properly diagnose this misuse of
+     the verify macro:
+
+       void function (int n) { verify (n < 0); }
+
+   * For the verify macro, the struct _gl_verify_type will need to
+     somehow be embedded into a declaration.  To be portable, this
+     declaration must declare an object, a constant, a function, or a
+     typedef name.  If the declared entity uses the type directly,
+     such as in
+
+       struct dummy {...};
+       typedef struct {...} dummy;
+       extern struct {...} *dummy;
+       extern void dummy (struct {...} *);
+       extern struct {...} *dummy (void);
+
+     two uses of the verify macro would yield colliding declarations
+     if the entity names are not disambiguated.  A workaround is to
+     attach the current line number to the entity name:
+
+       #define _GL_CONCAT0(x, y) x##y
+       #define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+       extern struct {...} * _GL_CONCAT (dummy, __LINE__);
+
+     But this has the problem that two invocations of verify from
+     within the same macro would collide, since the __LINE__ value
+     would be the same for both invocations.  (The GCC __COUNTER__
+     macro solves this problem, but is not portable.)
+
+     A solution is to use the sizeof operator.  It yields a number,
+     getting rid of the identity of the type.  Declarations like
+
+       extern int dummy [sizeof (struct {...})];
+       extern void dummy (int [sizeof (struct {...})]);
+       extern int (*dummy (void)) [sizeof (struct {...})];
+
+     can be repeated.
+
+   * Should the implementation use a named struct or an unnamed struct?
+     Which of the following alternatives can be used?
+
+       extern int dummy [sizeof (struct {...})];
+       extern int dummy [sizeof (struct _gl_verify_type {...})];
+       extern void dummy (int [sizeof (struct {...})]);
+       extern void dummy (int [sizeof (struct _gl_verify_type {...})]);
+       extern int (*dummy (void)) [sizeof (struct {...})];
+       extern int (*dummy (void)) [sizeof (struct _gl_verify_type {...})];
+
+     In the second and sixth case, the struct type is exported to the
+     outer scope; two such declarations therefore collide.  GCC warns
+     about the first, third, and fourth cases.  So the only remaining
+     possibility is the fifth case:
+
+       extern int (*dummy (void)) [sizeof (struct {...})];
+
+   * GCC warns about duplicate declarations of the dummy function if
+     -Wredundant-decls is used.  GCC 4.3 and later have a builtin
+     __COUNTER__ macro that can let us generate unique identifiers for
+     each dummy function, to suppress this warning.
+
+   * This implementation exploits the fact that older versions of GCC,
+     which do not support _Static_assert, also do not warn about the
+     last declaration mentioned above.
+
+   * GCC warns if -Wnested-externs is enabled and verify() is used
+     within a function body; but inside a function, you can always
+     arrange to use verify_expr() instead.
+
+   * In C++, any struct definition inside sizeof is invalid.
+     Use a template type to work around the problem.  */
+
+/* Concatenate two preprocessor tokens, macro-expanding them first.  */
+#define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+#define _GL_CONCAT0(x, y) x##y
+
+/* _GL_COUNTER is an integer, preferably one that changes each time we
+   use it.  Use __COUNTER__ if it works (i.e., two successive uses
+   differ), falling back on __LINE__ otherwise.  __LINE__ isn't
+   perfect, but it's better than a constant.  */
+#if defined __COUNTER__ && __COUNTER__ != __COUNTER__
+# define _GL_COUNTER __COUNTER__
+#else
+# define _GL_COUNTER __LINE__
+#endif
+
+/* Generate a symbol with the given prefix, making it unique if
+   possible.  */
+#define _GL_GENSYM(prefix) _GL_CONCAT (prefix, _GL_COUNTER)
+
+/* Verify requirement R at compile-time, as an integer constant expression
+   that returns 1.  If R is false, fail at compile-time, preferably
+   with a diagnostic that includes the string-literal DIAGNOSTIC.  */
+
+#define _GL_VERIFY_TRUE(R, DIAGNOSTIC) \
+   (!!sizeof (_GL_VERIFY_TYPE (R, DIAGNOSTIC)))
+
+#ifdef __cplusplus /* C++: no struct definition inside sizeof; use a template.  */
+# if !GNULIB_defined_struct__gl_verify_type
+template <int w>
+  struct _gl_verify_type {
+    unsigned int _gl_verify_error_if_negative: w;
+  };
+#  define GNULIB_defined_struct__gl_verify_type 1
+# endif
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    _gl_verify_type<(R) ? 1 : -1>
+#elif defined _GL_HAVE__STATIC_ASSERT /* C with a working _Static_assert.  */
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    struct {                                   \
+      _Static_assert (R, DIAGNOSTIC);          \
+      int _gl_dummy;                          \
+    }
+#else /* Fallback: a negative bit-field width is a compile-time error.  */
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    struct { unsigned int _gl_verify_error_if_negative: (R) ? 1 : -1; }
+#endif
+
+/* Verify requirement R at compile-time, as a declaration without a
+   trailing ';'.  If R is false, fail at compile-time, preferably
+   with a diagnostic that includes the string-literal DIAGNOSTIC.
+
+   Unfortunately, unlike C11, this implementation must appear as an
+   ordinary declaration, and cannot appear inside struct { ... }.  */
+
+#ifdef _GL_HAVE__STATIC_ASSERT
+# define _GL_VERIFY _Static_assert
+#else
+# define _GL_VERIFY(R, DIAGNOSTIC)				       \
+    extern int (*_GL_GENSYM (_gl_verify_function) (void))	       \
+      [_GL_VERIFY_TRUE (R, DIAGNOSTIC)]
+#endif
+
+/* _GL_STATIC_ASSERT_H is defined if this code is copied into assert.h.  */
+#ifdef _GL_STATIC_ASSERT_H
+# if !defined _GL_HAVE__STATIC_ASSERT && !defined _Static_assert
+#  define _Static_assert(R, DIAGNOSTIC) _GL_VERIFY (R, DIAGNOSTIC)
+# endif
+# if !defined _GL_HAVE_STATIC_ASSERT && !defined static_assert
+#  define static_assert _Static_assert /* C11 requires this #define.  */
+# endif
+#endif
+
+/* @assert.h omit start@  */
+
+/* Each of these macros verifies that its argument R is nonzero.  To
+   be portable, R should be an integer constant expression.  Unlike
+   assert (R), there is no run-time overhead.
+
+   There are two macros, since no single macro can be used in all
+   contexts in C.  verify_true (R) is for scalar contexts, including
+   integer constant expression contexts.  verify (R) is for declaration
+   contexts, e.g., the top level.  */
+
+/* Verify requirement R at compile-time, as an integer constant expression.
+   Return 1.  This is equivalent to verify_expr (R, 1).
+
+   verify_true is obsolescent; please use verify_expr instead.  */
+
+#define verify_true(R) _GL_VERIFY_TRUE (R, "verify_true (" #R ")")
+
+/* Verify requirement R at compile-time.  Return the value of the
+   expression E.  */
+
+#define verify_expr(R, E) \
+   (_GL_VERIFY_TRUE (R, "verify_expr (" #R ", " #E ")") ? (E) : (E))
+
+/* Verify requirement R at compile-time, as a declaration without a
+   trailing ';'.  */
+
+#define verify(R) _GL_VERIFY (R, "verify (" #R ")")
+
+#ifndef __has_builtin /* If absent, treat every builtin as unavailable.  */
+# define __has_builtin(x) 0
+#endif
+
+/* Assume that R always holds.  This lets the compiler optimize
+   accordingly.  R should not have side-effects; it may or may not be
+   evaluated.  Behavior is undefined if R is false.  */
+
+#if (__has_builtin (__builtin_unreachable) \
+     || 4 < __GNUC__ + (5 <= __GNUC_MINOR__)) /* i.e., GCC 4.5 or later */
+# define assume(R) ((R) ? (void) 0 : __builtin_unreachable ())
+#elif 1200 <= _MSC_VER
+# define assume(R) __assume (R)
+#elif ((defined GCC_LINT || defined lint) \
+       && (__has_builtin (__builtin_trap) \
+           || 3 < __GNUC__ + (3 < __GNUC_MINOR__ + (4 <= __GNUC_PATCHLEVEL__))))
+  /* Doing it this way helps various packages when configured with
+     --enable-gcc-warnings, which compiles with -Dlint.  It's nicer
+     when 'assume' silences warnings even with older GCCs.  */
+# define assume(R) ((R) ? (void) 0 : __builtin_trap ())
+#else /* No usable builtin: R is compiled but never evaluated.  */
+# define assume(R) ((void) (0 && (R)))
+#endif
+
+/* @assert.h omit end@  */
+
+#endif