aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog18
-rw-r--r--Makefile.am3
-rw-r--r--Makefile.in13
-rw-r--r--awk.h1
-rw-r--r--dfa.c316
-rw-r--r--dfa.h22
-rw-r--r--localeinfo.c113
-rw-r--r--localeinfo.h54
-rw-r--r--node.c2
-rw-r--r--re.c17
-rw-r--r--verify.h279
11 files changed, 622 insertions, 216 deletions
diff --git a/ChangeLog b/ChangeLog
index 023fa884..d3fcbdd3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,20 @@
-2016-08-29 Aharon Robbins <aharon.robbins@intel.com>
+2016-09-01 Arnold D. Robbins <arnold@skeeve.com>
+
+ Merge grep's now thread-safe dfa. Wheee.
+
+ * dfa.h, dfa.c: Sync with grep.
+ * localeinfo.h, localeinfo.c, verify.h: New files.
+ * Makefile.am (base_sources): Adjust.
+ * awk.h (using_utf8): Declare new function.
+ * node.c (str2wstr): Use using_utf8 instead of now-gone dfa function.
+ * re.c: Include "localeinfo.h".
+ (localeinfo): New static variable.
+ (make_regexp): Adjust call to dfa_syntax.
+ (resetup): Call init_localeinfo on localeinfo. Remove call to
+ now-gone function dfa_init.
+ (using_utf8): New function.
+
+2016-08-29 Arnold D. Robbins <arnold@skeeve.com>
* configure.ac (fwrite_unlocked): Check for it.
* awk.h (fwrite): Define to fwrite_unlocked if we have it.
diff --git a/Makefile.am b/Makefile.am
index dce65018..9acae0bc 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -109,6 +109,8 @@ base_sources = \
gettext.h \
int_array.c \
interpret.h \
+ localeinfo.c \
+ localeinfo.h \
io.c \
mbsupport.h \
main.c \
@@ -126,6 +128,7 @@ base_sources = \
replace.c \
str_array.c \
symbol.c \
+ verify.h \
version.c \
xalloc.h
diff --git a/Makefile.in b/Makefile.in
index 036361cb..f103a420 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -143,10 +143,11 @@ am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \
dfa.$(OBJEXT) eval.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \
floatcomp.$(OBJEXT) gawkapi.$(OBJEXT) gawkmisc.$(OBJEXT) \
getopt.$(OBJEXT) getopt1.$(OBJEXT) int_array.$(OBJEXT) \
- io.$(OBJEXT) main.$(OBJEXT) mpfr.$(OBJEXT) msg.$(OBJEXT) \
- node.$(OBJEXT) profile.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) \
- regex.$(OBJEXT) replace.$(OBJEXT) str_array.$(OBJEXT) \
- symbol.$(OBJEXT) version.$(OBJEXT)
+ localeinfo.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) \
+ mpfr.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) profile.$(OBJEXT) \
+ random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \
+ replace.$(OBJEXT) str_array.$(OBJEXT) symbol.$(OBJEXT) \
+ version.$(OBJEXT)
am_gawk_OBJECTS = $(am__objects_1)
gawk_OBJECTS = $(am_gawk_OBJECTS)
gawk_LDADD = $(LDADD)
@@ -518,6 +519,8 @@ base_sources = \
gettext.h \
int_array.c \
interpret.h \
+ localeinfo.c \
+ localeinfo.h \
io.c \
mbsupport.h \
main.c \
@@ -535,6 +538,7 @@ base_sources = \
replace.c \
str_array.c \
symbol.c \
+ verify.h \
version.c \
xalloc.h
@@ -681,6 +685,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt1.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/int_array.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/localeinfo.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpfr.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@
diff --git a/awk.h b/awk.h
index edd9cb95..2c401637 100644
--- a/awk.h
+++ b/awk.h
@@ -1656,6 +1656,7 @@ extern void resyntax(int syntax);
extern void resetup(void);
extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
extern int get_numbase(const char *str, bool use_locale);
+extern bool using_utf8(void);
/* symbol.c */
extern void load_symbols();
diff --git a/dfa.c b/dfa.c
index 85cb46ad..fad03e4f 100644
--- a/dfa.c
+++ b/dfa.c
@@ -69,6 +69,8 @@
#include "dfa.h"
+#include "localeinfo.h"
+
#ifdef GAWK
static int
is_blank (int c)
@@ -445,14 +447,9 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
- /* dfaexec implementation. */
- char *(*dfaexec) (struct dfa *, char const *, char *,
- bool, size_t *, bool *);
-
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -538,6 +535,21 @@ struct dfa
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
+
+ /* Information derived from the locale. This is at the end so that
+ a quick memset need not clear it specially. */
+
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *,
+ bool, size_t *, bool *);
+
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, e.g., the relationship between lower-
+ and upper-case letters is 1-1. */
+ bool simple_locale;
+
+ /* Other cached information derived from the locale. */
+ struct localeinfo localeinfo;
};
/* Some macros for user access to dfa internals. */
@@ -551,13 +563,8 @@ struct dfa
static void regexp (struct dfa *dfa);
-/* A table indexed by byte values that contains the corresponding wide
- character (if any) for that byte. WEOF means the byte is not a
- valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
-
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -577,7 +584,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF)
{
@@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
{
size_t i;
@@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
}
static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
@@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c)
{
if (c == dfa->syntax.eolbyte)
return CTX_NEWLINE;
- if (unibyte_word_constituent (c))
+ if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
return CTX_NONE;
}
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
- return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
- int i;
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- }
-}
-
-/* Entry point to set syntax options. */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
- int i;
- dfa->syntax.syntax_bits_set = true;
- dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- unsigned char uc = i;
-
- /* Use mbrtowc_cache to calculate sbit. */
- dfa->syntax.sbit[uc] = char_context (dfa, uc);
- switch (dfa->syntax.sbit[uc])
- {
- case CTX_LETTER:
- setbit (uc, dfa->syntax.letters);
- break;
- case CTX_NEWLINE:
- setbit (uc, dfa->syntax.newline);
- break;
- }
-
- /* POSIX requires that the five bytes in "\n\r./" (including the
- terminating NUL) cannot occur inside a multibyte character. */
- dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
- }
-}
-
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where
@@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
-static void check_utf8 (void)
-{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
- without multicharacter collating sequences and where range
- comparisons simply use the native encoding. These locales can be
- processed more efficiently. */
+/* Return true if the locale compatible with the C locale. */
static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (native_c_charset & !dfa->multibyte) | unibyte_c;
+ if (native_c_charset && !multibyte)
+ return true;
+ else
+ {
+ /* Treat C and POSIX locales as being compatible. Also, treat
+ errors as compatible, as these are invariably from stubs. */
+ char const *loc = setlocale (LC_ALL, NULL);
+ return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+ }
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa)
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE,
- };
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER. */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
- itself, and store them into FOLDED. Return the number of characters
- stored. */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
- unsigned int i;
- unsigned int n = 0;
- wint_t uc = towupper (c);
- wint_t lc = towlower (uc);
- if (uc != c)
- folded[n++] = uc;
- if (lc != uc && lc != c && towupper (lc) == uc)
- folded[n++] = lc;
- for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
- {
- wint_t li = lonesome_lower[i];
- if (li != lc && li != uc && li != c && towupper (li) == uc)
- folded[n++] = li;
- }
- return n;
-}
-
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa)
size_t chars_al;
chars_al = 0;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
&dfa->mbcsets_alloc,
@@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale (dfa);
+ known_bracket_exp = dfa->simple_locale;
}
else
invert = false;
@@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa)
if (!pred)
dfaerror (_("invalid character class"));
- if (dfa->multibyte && !pred->single_byte_only)
+ if (dfa->localeinfo.multibyte && !pred->single_byte_only)
known_bracket_exp = false;
else
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
known_bracket_exp = false;
- else if (using_simple_locale (dfa))
+ else if (dfa->simple_locale)
{
int ci;
for (ci = c; ci <= c2; ci++)
@@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
@@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa)
if (! known_bracket_exp)
return BACKREF;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
return MBCSET;
}
if (invert)
{
- assert (!dfa->multibyte);
+ assert (!dfa->localeinfo.multibyte);
notset (ccl);
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + dfa_charclass_index (dfa, ccl);
+ return CSET + charclass_index (dfa, ccl);
}
struct lexptr
@@ -1535,7 +1426,7 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
@@ -1549,13 +1440,13 @@ lex (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
case 's':
case 'S':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1564,7 +1455,7 @@ lex (struct dfa *dfa)
if (c == 'S')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1588,16 +1479,16 @@ lex (struct dfa *dfa)
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (unibyte_word_constituent (c2))
+ if (unibyte_word_constituent (dfa, c2))
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1627,14 +1518,14 @@ lex (struct dfa *dfa)
dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
return dfa->lex.lasttok = c;
@@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
{
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
sizeof *dfa->tokens);
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop);
}
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
@@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
static void
addtok (struct dfa *dfa, token t)
{
- if (dfa->multibyte && t == MBCSET)
+ if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1878,7 +1769,7 @@ atom (struct dfa *dfa)
dfa->parse.tok = lex (dfa);
}
- else if (dfa->parse.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
d->lex.lasttok = END;
d->lex.laststart = true;
d->lex.parens = 0;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
@@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
{
/* ANYCHAR must match a single character, so put it to
D->states[s].mbps which contains the positions which can
@@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state;
for (i = 0; i < NOTCHAR; ++i)
- trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+ trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
trans[d->syntax.eolbyte] = state_newline;
}
else
@@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
- if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+ if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
{
merge (&d->states[0].elems, &follows, &tmp);
copy (&tmp, &follows);
@@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (c == d->syntax.eolbyte)
trans[c] = state_newline;
- else if (unibyte_word_constituent (c))
+ else if (unibyte_word_constituent (d, c))
trans[c] = state_letter;
else if (c < NOTCHAR)
trans[c] = state;
@@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
d->trans[oldalloc] = NULL;
d->fails[oldalloc] = NULL;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
d->mb_trans[oldalloc] = NULL;
}
}
@@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d)
}
d->trcount = d->min_trcount;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
for (i = d->min_trcount; i < d->tralloc; i++)
{
@@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
return (char *) begin;
}
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this
regexp usage. */
@@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (!d->multibyte)
+ if (!d->localeinfo.multibyte)
continue;
/* fallthrough */
@@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8)
+ if (!d->localeinfo.using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->multibyte = false;
+ d->localeinfo.multibyte = false;
d->dfaexec = dfaexec_sb;
d->fast = true;
}
@@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d)
struct dfa *sup = dfaalloc ();
*sup = *d;
- sup->multibyte = false;
+ sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
@@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
@@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* These constraints aren't supported in a multibyte locale.
Ignore them in the superset DFA. */
@@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d)
}
sup->tindex = j;
- if (have_nchar && (have_achar || d->multibyte))
+ if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup;
else
{
@@ -3705,7 +3596,7 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
@@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xzalloc (sizeof *d);
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
- d->lex.cur_mb_len = 1;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
+/* Initialize DFA. */
void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+ reg_syntax_t bits, bool fold, unsigned char eol)
{
- check_utf8 ();
- check_unibyte_c ();
- init_mbrtowc_cache ();
+ int i;
+ memset (dfa, 0, offsetof (struct dfa, dfaexec));
+ dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+ dfa->simple_locale = using_simple_locale (linfo->multibyte);
+ dfa->localeinfo = *linfo;
+
+ dfa->fast = !dfa->localeinfo.multibyte;
+
+ dfa->lex.cur_mb_len = 1;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.syntax_bits = bits;
+ dfa->syntax.case_fold = fold;
+ dfa->syntax.eolbyte = eol;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
+ {
+ case CTX_LETTER:
+ setbit (uc, dfa->syntax.letters);
+ break;
+ case CTX_NEWLINE:
+ setbit (uc, dfa->syntax.newline);
+ break;
+ }
+
+ /* POSIX requires that the five bytes in "\n\r./" (including the
+ terminating NUL) cannot occur inside a multibyte character. */
+ dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+ ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
+ }
}
/* vim:set shiftwidth=2: */
diff --git a/dfa.h b/dfa.h
index 02f56f44..1fd37ec9 100644
--- a/dfa.h
+++ b/dfa.h
@@ -28,6 +28,8 @@
#define _GL_ATTRIBUTE_MALLOC
+struct localeinfo; /* See localeinfo.h. */
+
/* Element of a list of strings, at least one of which is known to
appear in any R.E. matching the DFA. */
struct dfamust
@@ -48,17 +50,22 @@ struct dfa;
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
+/* Initialize or reinitialize a DFA. This must be called before
+ any of the routines below. The arguments are:
+ 1. The DFA to operate on.
+ 2. Information about the current locale.
+ 3. The syntax bits described earlier in this file.
+ 4. The case-folding flag.
+ 5. The line terminator. */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+ reg_syntax_t, bool, unsigned char);
+
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
/* Free the storage held by the components of a struct dfamust. */
extern void dfamustfree (struct dfamust *);
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
- second sets the syntax bits described earlier in this file, the third sets
- the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
exact matcher. */
@@ -103,8 +110,3 @@ extern void dfawarn (const char *);
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/localeinfo.c b/localeinfo.c
new file mode 100644
index 00000000..ca96afc7
--- /dev/null
+++ b/localeinfo.c
@@ -0,0 +1,113 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wctype.h>
+
+/* The sbclen implementation relies on this. */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8. */
+
+static bool
+is_using_utf8 (void)
+{
+ wchar_t wc;
+ mbstate_t mbs = {0};
+ return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale. */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+ int i;
+
+ localeinfo->multibyte = MB_CUR_MAX > 1;
+ localeinfo->using_utf8 = is_using_utf8 ();
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t s = {0};
+ wchar_t wc;
+ size_t len = mbrtowc (&wc, &c, 1, &s);
+ localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+ localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+ }
+}
+
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+/* Verify that the worst case fits. This is 1 for towupper, 1 for
+ towlower, and 1 for each entry in LONESOME_LOWER. */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+ <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored. */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0;
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc);
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc)
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc)
+ folded[n++] = li;
+ }
+ return n;
+}
diff --git a/localeinfo.h b/localeinfo.h
new file mode 100644
index 00000000..cf2f9a69
--- /dev/null
+++ b/localeinfo.h
@@ -0,0 +1,54 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo
+{
+ /* MB_CUR_MAX > 1. */
+ bool multibyte;
+
+ /* The locale uses UTF-8. */
+ bool using_utf8;
+
+ /* An array indexed by byte values B that contains 1 if B is a
+ single-byte character, -1 if B is an encoding error, and -2 if B
+ is the leading byte of a multibyte character that contains more
+ than one byte. */
+ signed char sbclen[UCHAR_MAX + 1];
+
+ /* An array indexed by byte values B that contains the corresponding
+ wide character (if any) for B if sbclen[B] == 1. WEOF means the
+ byte is not a valid single-byte character, i.e., sbclen[B] == -1
+ or -2. */
+ wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is a generous upper bound. */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/node.c b/node.c
index c6c9af81..bb2fe437 100644
--- a/node.c
+++ b/node.c
@@ -752,7 +752,7 @@ str2wstr(NODE *n, size_t **ptr)
* stopping early. This is particularly important
* for match() where we need to build the indices.
*/
- if (dfa_using_utf8()) {
+ if (using_utf8()) {
count = 1;
wc = 0xFFFD; /* unicode replacement character */
goto set_wc;
diff --git a/re.c b/re.c
index c822c90f..6a100db0 100644
--- a/re.c
+++ b/re.c
@@ -25,10 +25,14 @@
#include "awk.h"
+#include "localeinfo.h"
+
static reg_syntax_t syn;
static void check_bracket_exp(char *s, size_t len);
const char *regexflags2str(int flags);
+static struct localeinfo localeinfo;
+
/* make_regexp --- generate compiled regular expressions */
Regexp *
@@ -223,7 +227,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
rp->dfareg = dfaalloc();
- dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
+ dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n');
dfacomp(buf, len, rp->dfareg, true);
} else
rp->dfareg = NULL;
@@ -395,6 +399,9 @@ re_update(NODE *t)
void
resetup()
{
+ // init localeinfo for dfa
+ init_localeinfo(& localeinfo);
+
/*
* Syntax bits: _that_ is yet another mind trip. Recreational drugs
* are helpful for recovering from the experience.
@@ -418,8 +425,14 @@ resetup()
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
(void) re_set_syntax(syn);
+}
+
+/* using_utf8 --- are we using utf8 */
- dfa_init();
+bool
+using_utf8(void)
+{
+ return localeinfo.using_utf8;
}
/* reisstring --- return true if the RE match is a simple string match */
diff --git a/verify.h b/verify.h
new file mode 100644
index 00000000..5c8381d2
--- /dev/null
+++ b/verify.h
@@ -0,0 +1,279 @@
+/* Compile-time assert-like macros.
+
+ Copyright (C) 2005-2006, 2009-2016 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Paul Eggert, Bruno Haible, and Jim Meyering. */
+
+#ifndef _GL_VERIFY_H
+#define _GL_VERIFY_H
+
+
+/* Define _GL_HAVE__STATIC_ASSERT to 1 if _Static_assert works as per C11.
+ This is supported by GCC 4.6.0 and later, in C mode, and its use
+ here generates easier-to-read diagnostics when verify (R) fails.
+
+ Define _GL_HAVE_STATIC_ASSERT to 1 if static_assert works as per C++11.
+ This will likely be supported by future GCC versions, in C++ mode.
+
+ Use this only with GCC. If we were willing to slow 'configure'
+ down we could also use it with other compilers, but since this
+ affects only the quality of diagnostics, why bother? */
+#if (4 < __GNUC__ + (6 <= __GNUC_MINOR__) \
+ && (201112L <= __STDC_VERSION__ || !defined __STRICT_ANSI__) \
+ && !defined __cplusplus)
+# define _GL_HAVE__STATIC_ASSERT 1
+#endif
+/* The condition (99 < __GNUC__) is temporary, until we know about the
+ first G++ release that supports static_assert. */
+#if (99 < __GNUC__) && defined __cplusplus
+# define _GL_HAVE_STATIC_ASSERT 1
+#endif
+
+/* FreeBSD 9.1 <sys/cdefs.h>, included by <stddef.h> and lots of other
+ system headers, defines a conflicting _Static_assert that is no
+ better than ours; override it. */
+#ifndef _GL_HAVE_STATIC_ASSERT
+# include <stddef.h>
+# undef _Static_assert
+#endif
+
+/* Each of these macros verifies that its argument R is nonzero. To
+ be portable, R should be an integer constant expression. Unlike
+ assert (R), there is no run-time overhead.
+
+ If _Static_assert works, verify (R) uses it directly. Similarly,
+ _GL_VERIFY_TRUE works by packaging a _Static_assert inside a struct
+ that is an operand of sizeof.
+
+ The code below uses several ideas for C++ compilers, and for C
+ compilers that do not support _Static_assert:
+
+ * The first step is ((R) ? 1 : -1). Given an expression R, of
+ integral or boolean or floating-point type, this yields an
+ expression of integral type, whose value is later verified to be
+ constant and nonnegative.
+
+ * Next this expression W is wrapped in a type
+ struct _gl_verify_type {
+ unsigned int _gl_verify_error_if_negative: W;
+ }.
+ If W is negative, this yields a compile-time error. No compiler can
+ deal with a bit-field of negative size.
+
+ One might think that an array size check would have the same
+ effect, that is, that the type struct { unsigned int dummy[W]; }
+ would work as well. However, inside a function, some compilers
+ (such as C++ compilers and GNU C) allow local parameters and
+ variables inside array size expressions. With these compilers,
+ an array size check would not properly diagnose this misuse of
+ the verify macro:
+
+ void function (int n) { verify (n < 0); }
+
+ * For the verify macro, the struct _gl_verify_type will need to
+ somehow be embedded into a declaration. To be portable, this
+ declaration must declare an object, a constant, a function, or a
+ typedef name. If the declared entity uses the type directly,
+ such as in
+
+ struct dummy {...};
+ typedef struct {...} dummy;
+ extern struct {...} *dummy;
+ extern void dummy (struct {...} *);
+ extern struct {...} *dummy (void);
+
+ two uses of the verify macro would yield colliding declarations
+ if the entity names are not disambiguated. A workaround is to
+ attach the current line number to the entity name:
+
+ #define _GL_CONCAT0(x, y) x##y
+ #define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+ extern struct {...} * _GL_CONCAT (dummy, __LINE__);
+
+ But this has the problem that two invocations of verify from
+ within the same macro would collide, since the __LINE__ value
+ would be the same for both invocations. (The GCC __COUNTER__
+ macro solves this problem, but is not portable.)
+
+ A solution is to use the sizeof operator. It yields a number,
+ getting rid of the identity of the type. Declarations like
+
+ extern int dummy [sizeof (struct {...})];
+ extern void dummy (int [sizeof (struct {...})]);
+ extern int (*dummy (void)) [sizeof (struct {...})];
+
+ can be repeated.
+
+ * Should the implementation use a named struct or an unnamed struct?
+ Which of the following alternatives can be used?
+
+ extern int dummy [sizeof (struct {...})];
+ extern int dummy [sizeof (struct _gl_verify_type {...})];
+ extern void dummy (int [sizeof (struct {...})]);
+ extern void dummy (int [sizeof (struct _gl_verify_type {...})]);
+ extern int (*dummy (void)) [sizeof (struct {...})];
+ extern int (*dummy (void)) [sizeof (struct _gl_verify_type {...})];
+
+ In the second and sixth case, the struct type is exported to the
+ outer scope; two such declarations therefore collide. GCC warns
+ about the first, third, and fourth cases. So the only remaining
+ possibility is the fifth case:
+
+ extern int (*dummy (void)) [sizeof (struct {...})];
+
+ * GCC warns about duplicate declarations of the dummy function if
+ -Wredundant-decls is used. GCC 4.3 and later have a builtin
+ __COUNTER__ macro that can let us generate unique identifiers for
+ each dummy function, to suppress this warning.
+
+ * This implementation exploits the fact that older versions of GCC,
+ which do not support _Static_assert, also do not warn about the
+ last declaration mentioned above.
+
+ * GCC warns if -Wnested-externs is enabled and verify() is used
+ within a function body; but inside a function, you can always
+ arrange to use verify_expr() instead.
+
+ * In C++, any struct definition inside sizeof is invalid.
+ Use a template type to work around the problem. */
+
+/* Concatenate two preprocessor tokens. */
+#define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+#define _GL_CONCAT0(x, y) x##y
+
+/* _GL_COUNTER is an integer, preferably one that changes each time we
+ use it. Use __COUNTER__ if it works, falling back on __LINE__
+ otherwise. __LINE__ isn't perfect, but it's better than a
+ constant. */
+#if defined __COUNTER__ && __COUNTER__ != __COUNTER__
+# define _GL_COUNTER __COUNTER__
+#else
+# define _GL_COUNTER __LINE__
+#endif
+
+/* Generate a symbol with the given prefix, making it unique if
+ possible. */
+#define _GL_GENSYM(prefix) _GL_CONCAT (prefix, _GL_COUNTER)
+
+/* Verify requirement R at compile-time, as an integer constant expression
+ that returns 1. If R is false, fail at compile-time, preferably
+ with a diagnostic that includes the string-literal DIAGNOSTIC. */
+
+#define _GL_VERIFY_TRUE(R, DIAGNOSTIC) \
+ (!!sizeof (_GL_VERIFY_TYPE (R, DIAGNOSTIC)))
+
+#ifdef __cplusplus
+# if !GNULIB_defined_struct__gl_verify_type
+template <int w>
+ struct _gl_verify_type {
+ unsigned int _gl_verify_error_if_negative: w;
+ };
+# define GNULIB_defined_struct__gl_verify_type 1
+# endif
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+ _gl_verify_type<(R) ? 1 : -1>
+#elif defined _GL_HAVE__STATIC_ASSERT
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+ struct { \
+ _Static_assert (R, DIAGNOSTIC); \
+ int _gl_dummy; \
+ }
+#else
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+ struct { unsigned int _gl_verify_error_if_negative: (R) ? 1 : -1; }
+#endif
+
+/* Verify requirement R at compile-time, as a declaration without a
+ trailing ';'. If R is false, fail at compile-time, preferably
+ with a diagnostic that includes the string-literal DIAGNOSTIC.
+
+ Unfortunately, unlike C11, this implementation must appear as an
+ ordinary declaration, and cannot appear inside struct { ... }. */
+
+#ifdef _GL_HAVE__STATIC_ASSERT
+# define _GL_VERIFY _Static_assert
+#else
+# define _GL_VERIFY(R, DIAGNOSTIC) \
+ extern int (*_GL_GENSYM (_gl_verify_function) (void)) \
+ [_GL_VERIFY_TRUE (R, DIAGNOSTIC)]
+#endif
+
+/* _GL_STATIC_ASSERT_H is defined if this code is copied into assert.h. */
+#ifdef _GL_STATIC_ASSERT_H
+# if !defined _GL_HAVE__STATIC_ASSERT && !defined _Static_assert
+# define _Static_assert(R, DIAGNOSTIC) _GL_VERIFY (R, DIAGNOSTIC)
+# endif
+# if !defined _GL_HAVE_STATIC_ASSERT && !defined static_assert
+# define static_assert _Static_assert /* C11 requires this #define. */
+# endif
+#endif
+
+/* @assert.h omit start@ */
+
+/* Each of these macros verifies that its argument R is nonzero. To
+ be portable, R should be an integer constant expression. Unlike
+ assert (R), there is no run-time overhead.
+
+ There are two macros, since no single macro can be used in all
+ contexts in C. verify_true (R) is for scalar contexts, including
+ integer constant expression contexts. verify (R) is for declaration
+ contexts, e.g., the top level. */
+
+/* Verify requirement R at compile-time, as an integer constant expression.
+ Return 1. This is equivalent to verify_expr (R, 1).
+
+ verify_true is obsolescent; please use verify_expr instead. */
+
+#define verify_true(R) _GL_VERIFY_TRUE (R, "verify_true (" #R ")")
+
+/* Verify requirement R at compile-time. Return the value of the
+ expression E. */
+
+#define verify_expr(R, E) \
+ (_GL_VERIFY_TRUE (R, "verify_expr (" #R ", " #E ")") ? (E) : (E))
+
+/* Verify requirement R at compile-time, as a declaration without a
+ trailing ';'. */
+
+#define verify(R) _GL_VERIFY (R, "verify (" #R ")")
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/* Assume that R always holds. This lets the compiler optimize
+ accordingly. R should not have side-effects; it may or may not be
+ evaluated. Behavior is undefined if R is false. */
+
+#if (__has_builtin (__builtin_unreachable) \
+ || 4 < __GNUC__ + (5 <= __GNUC_MINOR__))
+# define assume(R) ((R) ? (void) 0 : __builtin_unreachable ())
+#elif 1200 <= _MSC_VER
+# define assume(R) __assume (R)
+#elif ((defined GCC_LINT || defined lint) \
+ && (__has_builtin (__builtin_trap) \
+ || 3 < __GNUC__ + (3 < __GNUC_MINOR__ + (4 <= __GNUC_PATCHLEVEL__))))
+ /* Doing it this way helps various packages when configured with
+ --enable-gcc-warnings, which compiles with -Dlint. It's nicer
+ when 'assume' silences warnings even with older GCCs. */
+# define assume(R) ((R) ? (void) 0 : __builtin_trap ())
+#else
+# define assume(R) ((void) (0 && (R)))
+#endif
+
+/* @assert.h omit end@ */
+
+#endif