aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c537
1 files changed, 309 insertions, 228 deletions
diff --git a/dfa.c b/dfa.c
index 19ca737f..378305df 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,8 +43,15 @@
#include "missing_d/gawkbool.h"
#endif /* HAVE_STDBOOL_H */
-#include "dfa.h"
-
+/* Gawk doesn't use Gnulib, so don't assume that setlocale and
+ static_assert are present. */
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+ extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
#define STREQ(a, b) (strcmp (a, b) == 0)
@@ -84,6 +91,8 @@
#include "xalloc.h"
+#include "dfa.h"
+
#ifdef GAWK
static int
is_blank (int c)
@@ -130,7 +139,7 @@ typedef unsigned int charclass[CHARCLASS_INTS];
/* Convert a possibly-signed character to an unsigned character. This is
a bit safer than casting to unsigned char, since it catches some type
errors that the cast doesn't. */
-static inline unsigned char
+static unsigned char
to_uchar (char ch)
{
return ch;
@@ -219,7 +228,8 @@ enum
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
the empty string. */
- BACKREF, /* BACKREF is generated by \<digit>; it
+ BACKREF, /* BACKREF is generated by \<digit>
+ or by any other construct that
is not completely handled. If the scanner
detects a transition on backref, it returns
a kind of "semi-success" indicating that
@@ -402,6 +412,14 @@ struct dfa
size_t nmultibyte_prop;
int *multibyte_prop;
+#if MBS_SUPPORT
+ /* A table indexed by byte values that contains the corresponding wide
+ character (if any) for that byte. WEOF means the byte is the
+ leading byte of a multibyte character. Invalid and null bytes are
+ mapped to themselves. */
+ wint_t mbrtowc_cache[NOTCHAR];
+#endif
+
/* Array of the bracket expression in the DFA. */
struct mb_char_classes *mbcsets;
size_t nmbcsets;
@@ -504,6 +522,64 @@ static void regexp (void);
} \
while (false)
+static void
+dfambcache (struct dfa *d)
+{
+#if MBS_SUPPORT
+ int i;
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t s = { 0 };
+ wchar_t wc;
+ wint_t wi;
+ switch (mbrtowc (&wc, &c, 1, &s))
+ {
+ default: wi = wc; break;
+ case (size_t) -2: wi = WEOF; break;
+ case (size_t) -1: wi = uc; break;
+ }
+ d->mbrtowc_cache[uc] = wi;
+ }
+#endif
+}
+
+#if MBS_SUPPORT
+/* Given the dfa D, store into *PWC the result of converting the
+ leading bytes of the multibyte buffer S of length N bytes, updating
+ the conversion state in *MBS. On conversion error, convert just a
+ single byte as-is. Return the number of bytes converted.
+
+ This differs from mbrtowc (PWC, S, N, MBS) as follows:
+
+ * Extra arg D, containing an mbrtowc_cache for speed.
+ * N must be at least 1.
+ * S[N - 1] must be a sentinel byte.
+ * Shift encodings are not supported.
+ * The return value is always in the range 1..N.
+ * *MBS is always valid afterwards.
+ * *PWC is always set to something. */
+static size_t
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+ mbstate_t *mbs)
+{
+ unsigned char uc = s[0];
+ wint_t wc = d->mbrtowc_cache[uc];
+
+ if (wc == WEOF)
+ {
+ size_t nbytes = mbrtowc (pwc, s, n, mbs);
+ if (0 < nbytes && nbytes < (size_t) -2)
+ return nbytes;
+ memset (mbs, 0, sizeof *mbs);
+ wc = uc;
+ }
+
+ *pwc = wc;
+ return 1;
+}
+#endif
#ifdef DEBUG
@@ -731,67 +807,39 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
this may happen when folding case in weird Turkish locales where
dotless i/dotted I are not included in the chosen character set.
Return whether a bit was set in the charclass. */
-#if MBS_SUPPORT
static bool
setbit_wc (wint_t wc, charclass c)
{
+#if MBS_SUPPORT
int b = wctob (wc);
if (b == EOF)
return false;
setbit (b, c);
return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
- if it is valid in the current character set. */
-static void
-setbit_c (int b, charclass c)
-{
- /* Do nothing if b is invalid in this character set. */
- if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
- return;
- setbit (b, c);
-}
#else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
abort ();
/*NOTREACHED*/ return false;
-}
#endif
+}
-/* Like setbit_c, but if case is folded, set both cases of a letter. For
- MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
- and the caller takes care of setting the appropriate field of struct
- mb_char_classes. */
+/* Set a bit for B and its case variants in the charclass C.
+ MB_CUR_MAX must be 1. */
static void
setbit_case_fold_c (int b, charclass c)
{
- if (MB_CUR_MAX > 1)
- {
- wint_t wc = btowc (b);
- if (wc == WEOF)
- return;
- setbit (b, c);
- if (case_fold && iswalpha (wc))
- setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
- }
- else
- {
- setbit (b, c);
- if (case_fold && isalpha (b))
- setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
- }
+ int ub = toupper (b);
+ int i;
+ for (i = 0; i < NOTCHAR; i++)
+ if (toupper (i) == ub)
+ setbit (i, c);
}
/* UTF-8 encoding allows some optimizations that we can't otherwise
assume in a multibyte encoding. */
-static inline int
+int
using_utf8 (void)
{
static int utf8 = -1;
@@ -811,6 +859,46 @@ using_utf8 (void)
return utf8;
}
+/* Return true if the current locale is known to be a unibyte locale
+ without multicharacter collating sequences and where range
+ comparisons simply use the native encoding. These locales can be
+ processed more efficiently. */
+
+static bool
+using_simple_locale (void)
+{
+ /* True if the native character set is known to be compatible with
+ the C locale. The following test isn't perfect, but it's good
+ enough in practice, as only ASCII and EBCDIC are in common use
+ and this test correctly accepts ASCII and rejects EBCDIC. */
+ enum { native_c_charset =
+ ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+ && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+ && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+ && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+ && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+ && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+ && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+ && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+ && '}' == 125 && '~' == 126)
+ };
+
+ if (! native_c_charset || MB_CUR_MAX > 1)
+ return false;
+ else
+ {
+ static int unibyte_c = -1;
+ if (unibyte_c < 0)
+ {
+ char const *locale = setlocale (LC_ALL, NULL);
+ unibyte_c = (!locale
+ || STREQ (locale, "C")
+ || STREQ (locale, "POSIX"));
+ }
+ return unibyte_c;
+ }
+}
+
/* Lexical analyzer. All the dross that deals with the obnoxious
GNU Regex syntax bits is located here. The poor, suffering
reader is referred to the GNU Regex documentation for the
@@ -827,7 +915,7 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
/* These variables are used only if (MB_CUR_MAX > 1). */
-static mbstate_t mbs; /* Mbstate for mbrlen. */
+static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
@@ -864,32 +952,18 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
else \
{ \
wchar_t _wc; \
- cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
- if (cur_mb_len <= 0) \
- { \
- cur_mb_len = 1; \
- --lexleft; \
- (wc) = (c) = to_uchar (*lexptr++); \
- } \
- else \
- { \
- lexptr += cur_mb_len; \
- lexleft -= cur_mb_len; \
- (wc) = _wc; \
- (c) = wctob (wc); \
- } \
+ size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
+ cur_mb_len = nbytes; \
+ (wc) = _wc; \
+ (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
+ lexptr += nbytes; \
+ lexleft -= nbytes; \
} \
} while (0)
-# define FETCH(c, eoferr) \
- do { \
- wint_t wc; \
- FETCH_WC (c, wc, eoferr); \
- } while (0)
-
#else
/* Note that characters become unsigned here. */
-# define FETCH(c, eoferr) \
+# define FETCH_WC(c, unused, eoferr) \
do { \
if (! lexleft) \
{ \
@@ -902,14 +976,56 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
--lexleft; \
} while (0)
-# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
-
#endif /* MBS_SUPPORT */
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+ == CASE_FOLDED_BUFSIZE),
+ "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored. */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0;
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc);
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc)
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc)
+ folded[n++] = li;
+ }
+ return n;
+}
+
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -928,7 +1044,7 @@ static const struct dfa_ctype prednames[] = {
{"upper", isupper, false},
{"lower", islower, false},
{"digit", isdigit, true},
- {"xdigit", isxdigit, true},
+ {"xdigit", isxdigit, false},
{"space", isspace, false},
{"punct", ispunct, false},
{"alnum", isalnum, false},
@@ -959,6 +1075,10 @@ parse_bracket_exp (void)
int c, c1, c2;
charclass ccl;
+ /* True if this is a bracket expression that dfaexec is known to
+ process correctly. */
+ bool known_bracket_exp = true;
+
/* Used to warn about [:space:].
Bit 0 = first character is a colon.
Bit 1 = last character is a colon.
@@ -1000,6 +1120,7 @@ parse_bracket_exp (void)
{
FETCH_WC (c, wc, _("unbalanced ["));
invert = 1;
+ known_bracket_exp = using_simple_locale ();
}
else
invert = 0;
@@ -1014,16 +1135,14 @@ parse_bracket_exp (void)
we just treat it as a bunch of ordinary characters. We can do
this because we assume regex has checked for syntax errors before
dfa is ever called. */
- if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+ if (c == '[')
{
#define MAX_BRACKET_STRING_LEN 32
char str[MAX_BRACKET_STRING_LEN + 1];
FETCH_WC (c1, wc1, _("unbalanced ["));
- /* If pattern contains '[[:', '[[.', or '[[='. */
- if (c1 == ':'
- /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */
- || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+ if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
+ || c1 == '.' || c1 == '=')
{
size_t len = 0;
for (;;)
@@ -1042,7 +1161,10 @@ parse_bracket_exp (void)
/* Fetch bracket. */
FETCH_WC (c, wc, _("unbalanced ["));
if (c1 == ':')
- /* build character class. */
+ /* Build character class. POSIX allows character
+ classes to match multicharacter collating elements,
+ but the regex code does not support that, so do not
+ worry about that possibility. */
{
char const *class
= (case_fold && (STREQ (str, "upper")
@@ -1064,30 +1186,11 @@ parse_bracket_exp (void)
for (c2 = 0; c2 < NOTCHAR; ++c2)
if (pred->func (c2))
- setbit_case_fold_c (c2, ccl);
+ setbit (c2, ccl);
}
+ else
+ known_bracket_exp = false;
- else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
- {
- char *elem = xmemdup (str, len + 1);
-
- if (c1 == '=')
- /* build equivalence class. */
- {
- REALLOC_IF_NECESSARY (work_mbc->equivs,
- equivs_al, work_mbc->nequivs + 1);
- work_mbc->equivs[work_mbc->nequivs++] = elem;
- }
-
- if (c1 == '.')
- /* build collating element. */
- {
- REALLOC_IF_NECESSARY (work_mbc->coll_elems,
- coll_elems_al,
- work_mbc->ncoll_elems + 1);
- work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
- }
- }
colon_warning_state |= 8;
/* Fetch new lookahead character. */
@@ -1109,105 +1212,96 @@ parse_bracket_exp (void)
/* build range characters. */
{
FETCH_WC (c2, wc2, _("unbalanced ["));
- if (c2 == ']')
+
+ /* A bracket expression like [a-[.aa.]] matches an unknown set.
+ Treat it like [-a[.aa.]] while parsing it, and
+ remember that the set is unknown. */
+ if (c2 == '[' && *lexptr == '.')
{
- /* In the case [x-], the - is an ordinary hyphen,
- which is left in c1, the lookahead character. */
- lexptr -= cur_mb_len;
- lexleft += cur_mb_len;
+ known_bracket_exp = false;
+ c2 = ']';
}
- }
- if (c1 == '-' && c2 != ']')
- {
- if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH_WC (c2, wc2, _("unbalanced ["));
-
- if (MB_CUR_MAX > 1)
+ if (c2 != ']')
{
- /* When case folding map a range, say [m-z] (or even [M-z])
- to the pair of ranges, [m-z] [M-Z]. */
- REALLOC_IF_NECESSARY (work_mbc->range_sts,
- range_sts_al, work_mbc->nranges + 1);
- REALLOC_IF_NECESSARY (work_mbc->range_ends,
- range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] =
- case_fold ? towlower (wc) : (wchar_t) wc;
- work_mbc->range_ends[work_mbc->nranges++] =
- case_fold ? towlower (wc2) : (wchar_t) wc2;
-
- if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC (c2, wc2, _("unbalanced ["));
+
+ if (MB_CUR_MAX > 1)
{
+ /* When case folding map a range, say [m-z] (or even [M-z])
+ to the pair of ranges, [m-z] [M-Z]. Although this code
+ is wrong in multiple ways, it's never used in practice.
+ FIXME: Remove this (and related) unused code. */
REALLOC_IF_NECESSARY (work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
REALLOC_IF_NECESSARY (work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
- }
- }
- else
- {
-#ifdef GAWK
- c1 = c;
- if (case_fold)
- {
- c1 = tolower (c1);
- c2 = tolower (c2);
+ work_mbc->range_sts[work_mbc->nranges] =
+ case_fold ? towlower (wc) : (wchar_t) wc;
+ work_mbc->range_ends[work_mbc->nranges++] =
+ case_fold ? towlower (wc2) : (wchar_t) wc2;
+
+ if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ {
+ REALLOC_IF_NECESSARY (work_mbc->range_sts,
+ range_sts_al, work_mbc->nranges + 1);
+ work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+ REALLOC_IF_NECESSARY (work_mbc->range_ends,
+ range_ends_al, work_mbc->nranges + 1);
+ work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+ }
}
- for (c = c1; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
-#else
- /* Defer to the system regex library about the meaning
- of range expressions. */
- regex_t re;
- char pattern[6] = { '[', 0, '-', 0, ']', 0 };
- char subject[2] = { 0, 0 };
- c1 = c;
- if (case_fold)
+ else if (using_simple_locale ())
{
- c1 = tolower (c1);
- c2 = tolower (c2);
+ for (c1 = c; c1 <= c2; c1++)
+ setbit (c1, ccl);
+ if (case_fold)
+ {
+ int uc = toupper (c);
+ int uc2 = toupper (c2);
+ for (c1 = 0; c1 < NOTCHAR; c1++)
+ {
+ int uc1 = toupper (c1);
+ if (uc <= uc1 && uc1 <= uc2)
+ setbit (c1, ccl);
+ }
+ }
}
+ else
+ known_bracket_exp = false;
- pattern[1] = c1;
- pattern[3] = c2;
- regcomp (&re, pattern, REG_NOSUB);
- for (c = 0; c < NOTCHAR; ++c)
- {
- if ((case_fold && isupper (c)))
- continue;
- subject[0] = c;
- if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
- setbit_case_fold_c (c, ccl);
- }
- regfree (&re);
-#endif
+ colon_warning_state |= 8;
+ FETCH_WC (c1, wc1, _("unbalanced ["));
+ continue;
}
- colon_warning_state |= 8;
- FETCH_WC (c1, wc1, _("unbalanced ["));
- continue;
+ /* In the case [x-], the - is an ordinary hyphen,
+ which is left in c1, the lookahead character. */
+ lexptr -= cur_mb_len;
+ lexleft += cur_mb_len;
}
colon_warning_state |= (c == ':') ? 2 : 4;
if (MB_CUR_MAX == 1)
{
- setbit_case_fold_c (c, ccl);
+ if (case_fold)
+ setbit_case_fold_c (c, ccl);
+ else
+ setbit (c, ccl);
continue;
}
- if (case_fold && iswalpha (wc))
+ if (case_fold)
{
- wc = towlower (wc);
- if (!setbit_wc (wc, ccl))
- {
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = wc;
- }
- wc = towupper (wc);
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wc, folded);
+ REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+ work_mbc->nchars + n);
+ for (i = 0; i < n; i++)
+ if (!setbit_wc (folded[i], ccl))
+ work_mbc->chars[work_mbc->nchars++] = folded[i];
}
if (!setbit_wc (wc, ccl))
{
@@ -1221,6 +1315,9 @@ parse_bracket_exp (void)
if (colon_warning_state == 7)
dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
+ if (! known_bracket_exp)
+ return BACKREF;
+
if (MB_CUR_MAX > 1)
{
static charclass zeroclass;
@@ -1256,14 +1353,9 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- if (MB_CUR_MAX > 1)
- {
- FETCH_WC (c, wctok, NULL);
- if ((int) c == EOF)
- goto normal_char;
- }
- else
- FETCH (c, NULL);
+ FETCH_WC (c, wctok, NULL);
+ if (c == (unsigned int) EOF)
+ goto normal_char;
switch (c)
{
@@ -1638,10 +1730,11 @@ addtok (token t)
work_mbc->nchars = 0;
}
- /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
+ /* If the MBCSET is non-inverted and doesn't include neither
+ character classes including multibyte characters, range
+ expressions, equivalence classes nor collating elements,
+ it can be replaced to a simple CSET. */
if (work_mbc->invert
- || (!using_utf8 () && work_mbc->cset != -1)
- || work_mbc->nchars != 0
|| work_mbc->nch_classes != 0
|| work_mbc->nranges != 0
|| work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
@@ -1656,7 +1749,6 @@ addtok (token t)
that the mbcset is empty now. Do nothing in that case. */
if (work_mbc->cset != -1)
{
- assert (using_utf8 ());
addtok (CSET + work_mbc->cset);
if (need_or)
addtok (OR);
@@ -1680,16 +1772,19 @@ static void
addtok_wc (wint_t wc)
{
unsigned char buf[MB_LEN_MAX];
- mbstate_t s;
+ mbstate_t s = { 0 };
int i;
- memset (&s, 0, sizeof s);
- cur_mb_len = wcrtomb ((char *) buf, wc, &s);
+ size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
- /* This is merely stop-gap. When cur_mb_len is 0 or negative,
- buf[0] is undefined, yet skipping the addtok_mb call altogether
- can result in heap corruption. */
- if (cur_mb_len <= 0)
- buf[0] = 0;
+ if (stored_bytes != (size_t) -1)
+ cur_mb_len = stored_bytes;
+ else
+ {
+ /* This is merely stop-gap. buf[0] is undefined, yet skipping
+ the addtok_mb call altogether can corrupt the heap. */
+ cur_mb_len = 1;
+ buf[0] = 0;
+ }
addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
for (i = 1; i < cur_mb_len; i++)
@@ -1794,17 +1889,19 @@ add_utf8_anychar (void)
static void
atom (void)
{
- if (0)
- {
- /* empty */
- }
- else if (MBS_SUPPORT && tok == WCHAR)
+ if (MBS_SUPPORT && tok == WCHAR)
{
- addtok_wc (case_fold ? towlower (wctok) : wctok);
- if (case_fold && iswalpha (wctok))
+ addtok_wc (wctok);
+
+ if (case_fold)
{
- addtok_wc (towupper (wctok));
- addtok (OR);
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wctok, folded);
+ for (i = 0; i < n; i++)
+ {
+ addtok_wc (folded[i]);
+ addtok (OR);
+ }
}
tok = lex ();
@@ -3308,43 +3405,26 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
/* Initialize mblen_buf and inputwcs with data from the next line. */
static void
-prepare_wc_buf (const char *begin, const char *end)
+prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
{
#if MBS_SUPPORT
unsigned char eol = eolbyte;
- size_t remain_bytes, i;
+ size_t i;
+ size_t ilim = end - begin + 1;
buf_begin = (unsigned char *) begin;
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
+ for (i = 0; i < ilim; i++)
{
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || remain_bytes == (size_t) -1
- || remain_bytes == (size_t) -2
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t) begin[i];
- mblen_buf[i] = 0;
- if (begin[i] == eol)
- break;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
+ size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs);
+ mblen_buf[i] = nbytes - (nbytes == 1);
+ if (begin[i] == eol)
+ break;
+ while (--nbytes != 0)
{
- mblen_buf[i] = remain_bytes;
+ i++;
+ mblen_buf[i] = nbytes;
inputwcs[i] = 0;
- remain_bytes--;
}
}
@@ -3391,7 +3471,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
MALLOC (mblen_buf, end - begin + 2);
MALLOC (inputwcs, end - begin + 2);
memset (&mbs, 0, sizeof (mbstate_t));
- prepare_wc_buf ((const char *) p, end);
+ prepare_wc_buf (d, (const char *) p, end);
}
for (;;)
@@ -3481,7 +3561,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
++*count;
if (d->mb_cur_max > 1)
- prepare_wc_buf ((const char *) p, end);
+ prepare_wc_buf (d, (const char *) p, end);
}
/* Check if we've run off the end of the buffer. */
@@ -3600,6 +3680,7 @@ void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
dfainit (d);
+ dfambcache (d);
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);