diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2016-09-01 20:47:20 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2016-09-01 20:47:20 +0300 |
commit | 96f23ecad8027eae6b7cde2b219dc783e6b64814 (patch) | |
tree | 8f127617a3d0f28c87b733faf52121145be025c3 | |
parent | ca45449843f55136af05494ca60760033ae81838 (diff) | |
parent | b02f580f06996bd88f741f9c7330aff79216a169 (diff) | |
download | egawk-96f23ecad8027eae6b7cde2b219dc783e6b64814.tar.gz egawk-96f23ecad8027eae6b7cde2b219dc783e6b64814.tar.bz2 egawk-96f23ecad8027eae6b7cde2b219dc783e6b64814.zip |
Merge branch 'master' into feature/nocopy
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | Makefile.in | 13 | ||||
-rw-r--r-- | awk.h | 1 | ||||
-rw-r--r-- | dfa.c | 316 | ||||
-rw-r--r-- | dfa.h | 22 | ||||
-rw-r--r-- | localeinfo.c | 113 | ||||
-rw-r--r-- | localeinfo.h | 54 | ||||
-rw-r--r-- | node.c | 2 | ||||
-rw-r--r-- | re.c | 17 | ||||
-rw-r--r-- | verify.h | 279 |
11 files changed, 622 insertions, 216 deletions
@@ -1,4 +1,20 @@ -2016-08-29 Aharon Robbins <aharon.robbins@intel.com> +2016-09-01 Arnold D. Robbins <arnold@skeeve.com> + + Merge grep's now thread-safe dfa. Wheee. + + * dfa.h, dfa.c: Sync with grep. + * localeinfo.h, localeinfo.c, verify.h: New files. + * Makefile.am (base_sources): Adjust. + * awk.h (using_utf8): Declare new function. + * node.c (str2wstr): Use using_utf8 instead of now-gone dfa function. + * re.c: Include "localeinfo.h". + (localeinfo): New static variable. + (make_regexp): Adjust call to dfa_syntax. + (resetup): Call init_localeinfo on localeinfo. Remove call to + now-gone function dfa_init. + (using_utf8): New function. + +2016-08-29 Arnold D. Robbins <arnold@skeeve.com> * configure.ac (fwrite_unlocked): Check for it. * awk.h (fwrite): Define to fwrite_unlocked if we have it. diff --git a/Makefile.am b/Makefile.am index dce65018..9acae0bc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -109,6 +109,8 @@ base_sources = \ gettext.h \ int_array.c \ interpret.h \ + localeinfo.c \ + localeinfo.h \ io.c \ mbsupport.h \ main.c \ @@ -126,6 +128,7 @@ base_sources = \ replace.c \ str_array.c \ symbol.c \ + verify.h \ version.c \ xalloc.h diff --git a/Makefile.in b/Makefile.in index 036361cb..f103a420 100644 --- a/Makefile.in +++ b/Makefile.in @@ -143,10 +143,11 @@ am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \ dfa.$(OBJEXT) eval.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \ floatcomp.$(OBJEXT) gawkapi.$(OBJEXT) gawkmisc.$(OBJEXT) \ getopt.$(OBJEXT) getopt1.$(OBJEXT) int_array.$(OBJEXT) \ - io.$(OBJEXT) main.$(OBJEXT) mpfr.$(OBJEXT) msg.$(OBJEXT) \ - node.$(OBJEXT) profile.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) \ - regex.$(OBJEXT) replace.$(OBJEXT) str_array.$(OBJEXT) \ - symbol.$(OBJEXT) version.$(OBJEXT) + localeinfo.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) \ + mpfr.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) profile.$(OBJEXT) \ + random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \ + replace.$(OBJEXT) str_array.$(OBJEXT) symbol.$(OBJEXT) \ + 
version.$(OBJEXT) am_gawk_OBJECTS = $(am__objects_1) gawk_OBJECTS = $(am_gawk_OBJECTS) gawk_LDADD = $(LDADD) @@ -518,6 +519,8 @@ base_sources = \ gettext.h \ int_array.c \ interpret.h \ + localeinfo.c \ + localeinfo.h \ io.c \ mbsupport.h \ main.c \ @@ -535,6 +538,7 @@ base_sources = \ replace.c \ str_array.c \ symbol.c \ + verify.h \ version.c \ xalloc.h @@ -681,6 +685,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt1.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/int_array.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/localeinfo.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mpfr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@ @@ -1657,6 +1657,7 @@ extern void resyntax(int syntax); extern void resetup(void); extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf); extern int get_numbase(const char *str, bool use_locale); +extern bool using_utf8(void); /* symbol.c */ extern void load_symbols(); @@ -69,6 +69,8 @@ #include "dfa.h" +#include "localeinfo.h" + #ifdef GAWK static int is_blank (int c) @@ -445,14 +447,9 @@ struct dfa size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ - bool multibyte; /* MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ - /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, - bool, size_t *, bool *); - /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. @@ -538,6 +535,21 @@ struct dfa state_num **mb_trans; /* Transition tables for states with ANYCHAR. 
*/ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + + /* Information derived from the locale. This is at the end so that + a quick memset need not clear it specially. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); + + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, e.g., the relationship between lower- + and upper-case letters is 1-1. */ + bool simple_locale; + + /* Other cached information derived from the locale. */ + struct localeinfo localeinfo; }; /* Some macros for user access to dfa internals. */ @@ -551,13 +563,8 @@ struct dfa static void regexp (struct dfa *dfa); -/* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; - /* Store into *PWC the result of converting the leading bytes of the - multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + multibyte buffer S of length N bytes, using D->localeinfo.sbctowc and updating the conversion state in *D. On conversion error, convert just a single byte, to WEOF. Return the number of bytes converted. @@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR]; * PWC points to wint_t, not to wchar_t. * The last arg is a dfa *D instead of merely a multibyte conversion - state D->mbs. D also contains an mbrtowc_cache for speed. + state D->mbs. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. 
@@ -577,7 +584,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = mbrtowc_cache[uc]; + wint_t wc = d->localeinfo.sbctowc[uc]; if (wc == WEOF) { @@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) /* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -dfa_charclass_index (struct dfa *d, charclass const s) +charclass_index (struct dfa *d, charclass const s) { size_t i; @@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s) } static bool -unibyte_word_constituent (unsigned char c) +unibyte_word_constituent (struct dfa const *dfa, unsigned char c) { - return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); + return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); } static int @@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c) { if (c == dfa->syntax.eolbyte) return CTX_NEWLINE; - if (unibyte_word_constituent (c)) + if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; return CTX_NONE; } -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -static bool using_utf8; - -bool -dfa_using_utf8 (void) -{ - return using_utf8; -} - -static void -init_mbrtowc_cache (void) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} - -/* Entry point to set syntax options. */ -void -dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) -{ - int i; - dfa->syntax.syntax_bits_set = true; - dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; - - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - unsigned char uc = i; - - /* Use mbrtowc_cache to calculate sbit. 
*/ - dfa->syntax.sbit[uc] = char_context (dfa, uc); - switch (dfa->syntax.sbit[uc]) - { - case CTX_LETTER: - setbit (uc, dfa->syntax.letters); - break; - case CTX_NEWLINE: - setbit (uc, dfa->syntax.newline); - break; - } - - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); - } -} - /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, this may happen when folding case in weird Turkish locales where @@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } -static void check_utf8 (void) -{ - wchar_t wc; - mbstate_t mbs = { 0 }; - using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; -} - -static bool unibyte_c; - -static void check_unibyte_c (void) -{ - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); -} - -/* The current locale is known to be a unibyte locale - without multicharacter collating sequences and where range - comparisons simply use the native encoding. These locales can be - processed more efficiently. */ +/* Return true if the locale is compatible with the C locale. */ static bool -using_simple_locale (struct dfa const *dfa) +using_simple_locale (bool multibyte) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (native_c_charset & !dfa->multibyte) | unibyte_c; + if (native_c_charset && !multibyte) + return true; + else + { + /* Treat C and POSIX locales as being compatible. Also, treat + errors as compatible, as these are invariably from stubs.
*/ + char const *loc = setlocale (LC_ALL, NULL); + return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX"); + } } /* Fetch the next lexical input character. Set C (of type int) to the @@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa) # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE, - }; - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER. */ -enum -{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; - -/* Find the characters equal to C after case-folding, other than C - itself, and store them into FOLDED. Return the number of characters - stored. 
*/ -static unsigned int -case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) -{ - unsigned int i; - unsigned int n = 0; - wint_t uc = towupper (c); - wint_t lc = towlower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && towupper (lc) == uc) - folded[n++] = lc; - for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) - { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && towupper (li) == uc) - folded[n++] = li; - } - return n; -} - typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa) size_t chars_al; chars_al = 0; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, &dfa->mbcsets_alloc, @@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa) { FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (dfa); + known_bracket_exp = dfa->simple_locale; } else invert = false; @@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa) if (!pred) dfaerror (_("invalid character class")); - if (dfa->multibyte && !pred->single_byte_only) + if (dfa->localeinfo.multibyte && !pred->single_byte_only) known_bracket_exp = false; else for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) known_bracket_exp = false; - else if (using_simple_locale (dfa)) + else if (dfa->simple_locale) { int ci; for (ci = c; ci <= c2; ci++) @@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa) colon_warning_state |= (c == ':') ? 2 : 4; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { if (dfa->syntax.case_fold) setbit_case_fold_c (c, ccl); @@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa) if (! 
known_bracket_exp) return BACKREF; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); + work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl); return MBCSET; } if (invert) { - assert (!dfa->multibyte); + assert (!dfa->localeinfo.multibyte); notset (ccl); if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit ('\n', ccl); } - return CSET + dfa_charclass_index (dfa, ccl); + return CSET + charclass_index (dfa, ccl); } struct lexptr @@ -1535,7 +1426,7 @@ lex (struct dfa *dfa) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ @@ -1549,13 +1440,13 @@ lex (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); case 's': case 'S': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1564,7 +1455,7 @@ lex (struct dfa *dfa) if (c == 'S') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1588,16 +1479,16 @@ lex (struct dfa *dfa) if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (unibyte_word_constituent (dfa, c2)) setbit (c2, ccl); if (c == 'W') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + 
dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1627,14 +1518,14 @@ lex (struct dfa *dfa) dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } return dfa->lex.lasttok = c; @@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) { dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, sizeof *dfa->tokens); - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, sizeof *dfa->multibyte_prop); } - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; @@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc); static void addtok (struct dfa *dfa, token t) { - if (dfa->multibyte && t == MBCSET) + if (dfa->localeinfo.multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); + dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1878,7 +1769,7 @@ atom (struct dfa *dfa) dfa->parse.tok = lex (dfa); } - else if (dfa->parse.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. 
This avoids using the slow multibyte path. I'm @@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d) d->lex.lasttok = END; d->lex.laststart = true; d->lex.parens = 0; - if (d->multibyte) + if (d->localeinfo.multibyte) { d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); @@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) @@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR) { /* ANYCHAR must match a single character, so put it to D->states[s].mbps which contains the positions which can @@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; + trans[i] = unibyte_word_constituent (d, i) ? 
state_letter : state; trans[d->syntax.eolbyte] = state_newline; } else @@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) insert (d->follows[grps[i].elems[j]].elems[k], &follows); - if (d->multibyte) + if (d->localeinfo.multibyte) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes @@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ - if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) + if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte)) { merge (&d->states[0].elems, &follows, &tmp); copy (&tmp, &follows); @@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (c == d->syntax.eolbyte) trans[c] = state_newline; - else if (unibyte_word_constituent (c)) + else if (unibyte_word_constituent (d, c)) trans[c] = state_letter; else if (c < NOTCHAR) trans[c] = state; @@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); - if (d->multibyte) + if (d->localeinfo.multibyte) { realtrans = d->mb_trans ? 
d->mb_trans - 1 : NULL; realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); @@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) { d->trans[oldalloc] = NULL; d->fails[oldalloc] = NULL; - if (d->multibyte) + if (d->localeinfo.multibyte) d->mb_trans[oldalloc] = NULL; } } @@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d) } d->trcount = d->min_trcount; - if (d->multibyte) + if (d->localeinfo.multibyte) { for (i = d->min_trcount; i < d->tralloc; i++) { @@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, return (char *) begin; } -/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte), +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), but faster and set *BACKREF if the DFA code does not support this regexp usage. */ @@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (!d->multibyte) + if (!d->localeinfo.multibyte) continue; /* fallthrough */ @@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8) + if (!d->localeinfo.using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->multibyte = false; + d->localeinfo.multibyte = false; d->dfaexec = dfaexec_sb; d->fast = true; } @@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d) struct dfa *sup = dfaalloc (); *sup = *d; - sup->multibyte = false; + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; @@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d) case BACKREF: zeroset (ccl); notset (ccl); - sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = CSET + charclass_index (sup, ccl); sup->tokens[j++] = STAR; if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR || d->tokens[i + 1] == PLUS) @@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d) case ENDWORD: 
case LIMWORD: case NOTLIMWORD: - if (d->multibyte) + if (d->localeinfo.multibyte) { /* These constraints aren't supported in a multibyte locale. Ignore them in the superset DFA. */ @@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d) } sup->tindex = j; - if (have_nchar && (have_achar || d->multibyte)) + if (have_nchar && (have_achar || d->localeinfo.multibyte)) d->superset = sup; else { @@ -3705,7 +3596,7 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->multibyte) + if (d->localeinfo.multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) @@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xzalloc (sizeof *d); - d->multibyte = MB_CUR_MAX > 1; - d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; - d->fast = !d->multibyte; - d->lex.cur_mb_len = 1; - return d; + return xmalloc (sizeof (struct dfa)); } +/* Initialize DFA. */ void -dfa_init (void) +dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, + reg_syntax_t bits, bool fold, unsigned char eol) { - check_utf8 (); - check_unibyte_c (); - init_mbrtowc_cache (); + int i; + memset (dfa, 0, offsetof (struct dfa, dfaexec)); + dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; + dfa->simple_locale = using_simple_locale (linfo->multibyte); + dfa->localeinfo = *linfo; + + dfa->fast = !dfa->localeinfo.multibyte; + + dfa->lex.cur_mb_len = 1; + dfa->syntax.syntax_bits_set = true; + dfa->syntax.syntax_bits = bits; + dfa->syntax.case_fold = fold; + dfa->syntax.eolbyte = eol; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + dfa->syntax.sbit[uc] = char_context (dfa, uc); + switch (dfa->syntax.sbit[uc]) + { + case CTX_LETTER: + setbit (uc, dfa->syntax.letters); + break; + case CTX_NEWLINE: + setbit (uc, dfa->syntax.newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. 
*/ + dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 + ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* vim:set shiftwidth=2: */ @@ -28,6 +28,8 @@ #define _GL_ATTRIBUTE_MALLOC +struct localeinfo; /* See localeinfo.h. */ + /* Element of a list of strings, at least one of which is known to appear in any R.E. matching the DFA. */ struct dfamust @@ -48,17 +50,22 @@ struct dfa; calling dfafree() on it. */ extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC; +/* Initialize or reinitialize a DFA. This must be called before + any of the routines below. The arguments are: + 1. The DFA to operate on. + 2. Information about the current locale. + 3. The syntax bits described earlier in this file. + 4. The case-folding flag. + 5. The line terminator. */ +extern void dfasyntax (struct dfa *, struct localeinfo const *, + reg_syntax_t, bool, unsigned char); + /* Build and return the struct dfamust from the given struct dfa. */ extern struct dfamust *dfamust (struct dfa const *); /* Free the storage held by the components of a struct dfamust. */ extern void dfamustfree (struct dfamust *); -/* dfasyntax() takes four arguments; the first is the dfa to operate on, the - second sets the syntax bits described earlier in this file, the third sets - the case-folding flag, and the fourth specifies the line terminator. */ -extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char); - /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an exact matcher. */ @@ -103,8 +110,3 @@ extern void dfawarn (const char *); takes a single argument, a NUL-terminated string describing the error. The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); - -extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE; - -/* This must be called before calling any of the above dfa*() functions. 
*/ -extern void dfa_init (void); diff --git a/localeinfo.c b/localeinfo.c new file mode 100644 index 00000000..ca96afc7 --- /dev/null +++ b/localeinfo.c @@ -0,0 +1,113 @@ +/* locale information + + Copyright 2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written by Paul Eggert. */ + +#include <config.h> + +#include <localeinfo.h> + +#include <verify.h> + +#include <limits.h> +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <wctype.h> + +/* The sbclen implementation relies on this. */ +verify (MB_LEN_MAX <= SCHAR_MAX); + +/* Return true if the locale uses UTF-8. */ + +static bool +is_using_utf8 (void) +{ + wchar_t wc; + mbstate_t mbs = {0}; + return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; +} + +/* Initialize *LOCALEINFO from the current locale. */ + +void +init_localeinfo (struct localeinfo *localeinfo) +{ + int i; + + localeinfo->multibyte = MB_CUR_MAX > 1; + localeinfo->using_utf8 = is_using_utf8 (); + + for (i = CHAR_MIN; i <= CHAR_MAX; i++) + { + char c = i; + unsigned char uc = i; + mbstate_t s = {0}; + wchar_t wc; + size_t len = mbrtowc (&wc, &c, 1, &s); + localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len; + localeinfo->sbctowc[uc] = len <= 1 ? 
wc : WEOF; + } +} + +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +/* Verify that the worst case fits. This is 1 for towupper, 1 for + towlower, and 1 for each entry in LONESOME_LOWER. */ +verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower + <= CASE_FOLDED_BUFSIZE); + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. */ + +int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + int i; + int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} diff --git a/localeinfo.h b/localeinfo.h new file mode 100644 index 00000000..cf2f9a69 --- /dev/null +++ b/localeinfo.h @@ -0,0 +1,54 @@ +/* locale information + + Copyright 2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written by Paul Eggert. */ + +#include <limits.h> +#include <stdbool.h> +#include <wchar.h> + +struct localeinfo +{ + /* MB_CUR_MAX > 1. */ + bool multibyte; + + /* The locale uses UTF-8. */ + bool using_utf8; + + /* An array indexed by byte values B that contains 1 if B is a + single-byte character, -1 if B is an encoding error, and -2 if B + is the leading byte of a multibyte character that contains more + than one byte. */ + signed char sbclen[UCHAR_MAX + 1]; + + /* An array indexed by byte values B that contains the corresponding + wide character (if any) for B if sbclen[B] == 1. WEOF means the + byte is not a valid single-byte character, i.e., sbclen[B] == -1 + or -2. */ + wint_t sbctowc[UCHAR_MAX + 1]; +}; + +extern void init_localeinfo (struct localeinfo *); + +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is a generous upper bound. */ +enum { CASE_FOLDED_BUFSIZE = 32 }; + +extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); @@ -752,7 +752,7 @@ str2wstr(NODE *n, size_t **ptr) * stopping early. This is particularly important * for match() where we need to build the indices. 
*/ - if (dfa_using_utf8()) { + if (using_utf8()) { count = 1; wc = 0xFFFD; /* unicode replacement character */ goto set_wc; @@ -25,10 +25,14 @@ #include "awk.h" +#include "localeinfo.h" + static reg_syntax_t syn; static void check_bracket_exp(char *s, size_t len); const char *regexflags2str(int flags); +static struct localeinfo localeinfo; + /* make_regexp --- generate compiled regular expressions */ Regexp * @@ -223,7 +227,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { rp->dfareg = dfaalloc(); - dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n'); + dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n'); dfacomp(buf, len, rp->dfareg, true); } else rp->dfareg = NULL; @@ -395,6 +399,9 @@ re_update(NODE *t) void resetup() { + // init localeinfo for dfa + init_localeinfo(& localeinfo); + /* * Syntax bits: _that_ is yet another mind trip. Recreational drugs * are helpful for recovering from the experience. @@ -418,8 +425,14 @@ resetup() syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES; (void) re_set_syntax(syn); +} + +/* using_utf8 --- are we using utf8 */ - dfa_init(); +bool +using_utf8(void) +{ + return localeinfo.using_utf8; } /* reisstring --- return true if the RE match is a simple string match */ diff --git a/verify.h b/verify.h new file mode 100644 index 00000000..5c8381d2 --- /dev/null +++ b/verify.h @@ -0,0 +1,279 @@ +/* Compile-time assert-like macros. + + Copyright (C) 2005-2006, 2009-2016 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Paul Eggert, Bruno Haible, and Jim Meyering. */ + +#ifndef _GL_VERIFY_H +#define _GL_VERIFY_H + + +/* Define _GL_HAVE__STATIC_ASSERT to 1 if _Static_assert works as per C11. + This is supported by GCC 4.6.0 and later, in C mode, and its use + here generates easier-to-read diagnostics when verify (R) fails. + + Define _GL_HAVE_STATIC_ASSERT to 1 if static_assert works as per C++11. + This will likely be supported by future GCC versions, in C++ mode. + + Use this only with GCC. If we were willing to slow 'configure' + down we could also use it with other compilers, but since this + affects only the quality of diagnostics, why bother? */ +#if (4 < __GNUC__ + (6 <= __GNUC_MINOR__) \ + && (201112L <= __STDC_VERSION__ || !defined __STRICT_ANSI__) \ + && !defined __cplusplus) +# define _GL_HAVE__STATIC_ASSERT 1 +#endif +/* The condition (99 < __GNUC__) is temporary, until we know about the + first G++ release that supports static_assert. */ +#if (99 < __GNUC__) && defined __cplusplus +# define _GL_HAVE_STATIC_ASSERT 1 +#endif + +/* FreeBSD 9.1 <sys/cdefs.h>, included by <stddef.h> and lots of other + system headers, defines a conflicting _Static_assert that is no + better than ours; override it. */ +#ifndef _GL_HAVE_STATIC_ASSERT +# include <stddef.h> +# undef _Static_assert +#endif + +/* Each of these macros verifies that its argument R is nonzero. To + be portable, R should be an integer constant expression. Unlike + assert (R), there is no run-time overhead. + + If _Static_assert works, verify (R) uses it directly. 
Similarly, + _GL_VERIFY_TRUE works by packaging a _Static_assert inside a struct + that is an operand of sizeof. + + The code below uses several ideas for C++ compilers, and for C + compilers that do not support _Static_assert: + + * The first step is ((R) ? 1 : -1). Given an expression R, of + integral or boolean or floating-point type, this yields an + expression of integral type, whose value is later verified to be + constant and nonnegative. + + * Next this expression W is wrapped in a type + struct _gl_verify_type { + unsigned int _gl_verify_error_if_negative: W; + }. + If W is negative, this yields a compile-time error. No compiler can + deal with a bit-field of negative size. + + One might think that an array size check would have the same + effect, that is, that the type struct { unsigned int dummy[W]; } + would work as well. However, inside a function, some compilers + (such as C++ compilers and GNU C) allow local parameters and + variables inside array size expressions. With these compilers, + an array size check would not properly diagnose this misuse of + the verify macro: + + void function (int n) { verify (n < 0); } + + * For the verify macro, the struct _gl_verify_type will need to + somehow be embedded into a declaration. To be portable, this + declaration must declare an object, a constant, a function, or a + typedef name. If the declared entity uses the type directly, + such as in + + struct dummy {...}; + typedef struct {...} dummy; + extern struct {...} *dummy; + extern void dummy (struct {...} *); + extern struct {...} *dummy (void); + + two uses of the verify macro would yield colliding declarations + if the entity names are not disambiguated. 
A workaround is to + attach the current line number to the entity name: + + #define _GL_CONCAT0(x, y) x##y + #define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y) + extern struct {...} * _GL_CONCAT (dummy, __LINE__); + + But this has the problem that two invocations of verify from + within the same macro would collide, since the __LINE__ value + would be the same for both invocations. (The GCC __COUNTER__ + macro solves this problem, but is not portable.) + + A solution is to use the sizeof operator. It yields a number, + getting rid of the identity of the type. Declarations like + + extern int dummy [sizeof (struct {...})]; + extern void dummy (int [sizeof (struct {...})]); + extern int (*dummy (void)) [sizeof (struct {...})]; + + can be repeated. + + * Should the implementation use a named struct or an unnamed struct? + Which of the following alternatives can be used? + + extern int dummy [sizeof (struct {...})]; + extern int dummy [sizeof (struct _gl_verify_type {...})]; + extern void dummy (int [sizeof (struct {...})]); + extern void dummy (int [sizeof (struct _gl_verify_type {...})]); + extern int (*dummy (void)) [sizeof (struct {...})]; + extern int (*dummy (void)) [sizeof (struct _gl_verify_type {...})]; + + In the second and sixth case, the struct type is exported to the + outer scope; two such declarations therefore collide. GCC warns + about the first, third, and fourth cases. So the only remaining + possibility is the fifth case: + + extern int (*dummy (void)) [sizeof (struct {...})]; + + * GCC warns about duplicate declarations of the dummy function if + -Wredundant-decls is used. GCC 4.3 and later have a builtin + __COUNTER__ macro that can let us generate unique identifiers for + each dummy function, to suppress this warning. + + * This implementation exploits the fact that older versions of GCC, + which do not support _Static_assert, also do not warn about the + last declaration mentioned above. 
+ + * GCC warns if -Wnested-externs is enabled and verify() is used + within a function body; but inside a function, you can always + arrange to use verify_expr() instead. + + * In C++, any struct definition inside sizeof is invalid. + Use a template type to work around the problem. */ + +/* Concatenate two preprocessor tokens. */ +#define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y) +#define _GL_CONCAT0(x, y) x##y + +/* _GL_COUNTER is an integer, preferably one that changes each time we + use it. Use __COUNTER__ if it works, falling back on __LINE__ + otherwise. __LINE__ isn't perfect, but it's better than a + constant. */ +#if defined __COUNTER__ && __COUNTER__ != __COUNTER__ +# define _GL_COUNTER __COUNTER__ +#else +# define _GL_COUNTER __LINE__ +#endif + +/* Generate a symbol with the given prefix, making it unique if + possible. */ +#define _GL_GENSYM(prefix) _GL_CONCAT (prefix, _GL_COUNTER) + +/* Verify requirement R at compile-time, as an integer constant expression + that returns 1. If R is false, fail at compile-time, preferably + with a diagnostic that includes the string-literal DIAGNOSTIC. */ + +#define _GL_VERIFY_TRUE(R, DIAGNOSTIC) \ + (!!sizeof (_GL_VERIFY_TYPE (R, DIAGNOSTIC))) + +#ifdef __cplusplus +# if !GNULIB_defined_struct__gl_verify_type +template <int w> + struct _gl_verify_type { + unsigned int _gl_verify_error_if_negative: w; + }; +# define GNULIB_defined_struct__gl_verify_type 1 +# endif +# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \ + _gl_verify_type<(R) ? 1 : -1> +#elif defined _GL_HAVE__STATIC_ASSERT +# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \ + struct { \ + _Static_assert (R, DIAGNOSTIC); \ + int _gl_dummy; \ + } +#else +# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \ + struct { unsigned int _gl_verify_error_if_negative: (R) ? 1 : -1; } +#endif + +/* Verify requirement R at compile-time, as a declaration without a + trailing ';'. If R is false, fail at compile-time, preferably + with a diagnostic that includes the string-literal DIAGNOSTIC. 
+ + Unfortunately, unlike C11, this implementation must appear as an + ordinary declaration, and cannot appear inside struct { ... }. */ + +#ifdef _GL_HAVE__STATIC_ASSERT +# define _GL_VERIFY _Static_assert +#else +# define _GL_VERIFY(R, DIAGNOSTIC) \ + extern int (*_GL_GENSYM (_gl_verify_function) (void)) \ + [_GL_VERIFY_TRUE (R, DIAGNOSTIC)] +#endif + +/* _GL_STATIC_ASSERT_H is defined if this code is copied into assert.h. */ +#ifdef _GL_STATIC_ASSERT_H +# if !defined _GL_HAVE__STATIC_ASSERT && !defined _Static_assert +# define _Static_assert(R, DIAGNOSTIC) _GL_VERIFY (R, DIAGNOSTIC) +# endif +# if !defined _GL_HAVE_STATIC_ASSERT && !defined static_assert +# define static_assert _Static_assert /* C11 requires this #define. */ +# endif +#endif + +/* @assert.h omit start@ */ + +/* Each of these macros verifies that its argument R is nonzero. To + be portable, R should be an integer constant expression. Unlike + assert (R), there is no run-time overhead. + + There are two macros, since no single macro can be used in all + contexts in C. verify_true (R) is for scalar contexts, including + integer constant expression contexts. verify (R) is for declaration + contexts, e.g., the top level. */ + +/* Verify requirement R at compile-time, as an integer constant expression. + Return 1. This is equivalent to verify_expr (R, 1). + + verify_true is obsolescent; please use verify_expr instead. */ + +#define verify_true(R) _GL_VERIFY_TRUE (R, "verify_true (" #R ")") + +/* Verify requirement R at compile-time. Return the value of the + expression E. */ + +#define verify_expr(R, E) \ + (_GL_VERIFY_TRUE (R, "verify_expr (" #R ", " #E ")") ? (E) : (E)) + +/* Verify requirement R at compile-time, as a declaration without a + trailing ';'. */ + +#define verify(R) _GL_VERIFY (R, "verify (" #R ")") + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/* Assume that R always holds. This lets the compiler optimize + accordingly. 
R should not have side-effects; it may or may not be + evaluated. Behavior is undefined if R is false. */ + +#if (__has_builtin (__builtin_unreachable) \ + || 4 < __GNUC__ + (5 <= __GNUC_MINOR__)) +# define assume(R) ((R) ? (void) 0 : __builtin_unreachable ()) +#elif 1200 <= _MSC_VER +# define assume(R) __assume (R) +#elif ((defined GCC_LINT || defined lint) \ + && (__has_builtin (__builtin_trap) \ + || 3 < __GNUC__ + (3 < __GNUC_MINOR__ + (4 <= __GNUC_PATCHLEVEL__)))) + /* Doing it this way helps various packages when configured with + --enable-gcc-warnings, which compiles with -Dlint. It's nicer + when 'assume' silences warnings even with older GCCs. */ +# define assume(R) ((R) ? (void) 0 : __builtin_trap ()) +#else +# define assume(R) ((void) (0 && (R))) +#endif + +/* @assert.h omit end@ */ + +#endif |