aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c1425
1 files changed, 677 insertions, 748 deletions
diff --git a/dfa.c b/dfa.c
index 378305df..d306d5c9 100644
--- a/dfa.c
+++ b/dfa.c
@@ -37,21 +37,11 @@
#if HAVE_SETLOCALE
#include <locale.h>
#endif
-#ifdef HAVE_STDBOOL_H
-#include <stdbool.h>
-#else
-#include "missing_d/gawkbool.h"
-#endif /* HAVE_STDBOOL_H */
-/* Gawk doesn't use Gnulib, so don't assume that setlocale and
- static_assert are present. */
+/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */
#ifndef LC_ALL
# define setlocale(category, locale) NULL
#endif
-#ifndef static_assert
-# define static_assert(cond, diagnostic) \
- extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
-#endif
#define STREQ(a, b) (strcmp (a, b) == 0)
@@ -85,10 +75,6 @@
#endif
#endif /* GAWK */
-#if HAVE_LANGINFO_CODESET
-# include <langinfo.h>
-#endif
-
#include "xalloc.h"
#include "dfa.h"
@@ -338,7 +324,8 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- char backref; /* True if this state matches a \<digit>. */
+ bool has_backref; /* True if this state matches a \<digit>. */
+ bool has_mbcset; /* True if this state matches a MBCSET. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -355,13 +342,16 @@ typedef ptrdiff_t state_num;
struct mb_char_classes
{
ptrdiff_t cset;
- int invert;
+ bool invert;
wchar_t *chars; /* Normal characters. */
size_t nchars;
wctype_t *ch_classes; /* Character classes. */
size_t nch_classes;
- wchar_t *range_sts; /* Range characters (start of the range). */
- wchar_t *range_ends; /* Range characters (end of the range). */
+ struct /* Range characters. */
+ {
+ wchar_t beg; /* Range start. */
+ wchar_t end; /* Range end. */
+ } *ranges;
size_t nranges;
char **equivs; /* Equivalence classes. */
size_t nequivs;
@@ -387,10 +377,11 @@ struct dfa
size_t nleaves; /* Number of leaves on the parse tree. */
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
- unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
+ bool multibyte; /* True iff MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
+ mbstate_t mbs; /* Multibyte conversion state. */
- /* The following are used only if MB_CUR_MAX > 1. */
+ /* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
if tokens[i] < NOTCHAR
@@ -409,7 +400,6 @@ struct dfa
multibyte_prop
= 3 , 1 , 0 , 2 , 3
*/
- size_t nmultibyte_prop;
int *multibyte_prop;
#if MBS_SUPPORT
@@ -425,10 +415,13 @@ struct dfa
size_t nmbcsets;
size_t mbcsets_alloc;
+ /* Fields filled by the superset. */
+ struct dfa *superset; /* Hint of the dfa. */
+
/* Fields filled by the state builder. */
dfa_state *states; /* States of the dfa. */
state_num sindex; /* Index for adding new states. */
- state_num salloc; /* Number of states currently allocated. */
+ size_t salloc; /* Number of states currently allocated. */
/* Fields filled by the parse tree->NFA conversion. */
position_set *follows; /* Array of follow sets, indexed by position
@@ -438,7 +431,7 @@ struct dfa
matching the given position in a string
matching the regexp. Allocated to the
maximum possible position index. */
- int searchflag; /* True if we are supposed to build a searching
+ bool searchflag; /* True if we are supposed to build a searching
as opposed to an exact matcher. A searching
matcher finds the first and shortest string
matching a regexp anywhere in the buffer,
@@ -448,16 +441,16 @@ struct dfa
/* Fields filled by dfaexec. */
state_num tralloc; /* Number of transition tables that have
- slots so far. */
+ slots so far, not counting trans[-1]. */
int trcount; /* Number of transition tables that have
actually been built. */
state_num **trans; /* Transition tables for states that can
never accept. If the transitions for a
state have not yet been computed, or the
state could possibly accept, its entry in
- this table is NULL. */
- state_num **realtrans; /* Trans always points to realtrans + 1; this
- is so trans[-1] can contain NULL. */
+ this table is NULL. This points to one
+ past the start of the allocated array,
+ and trans[-1] is always NULL. */
state_num **fails; /* Transition tables after failing to accept
on a state that potentially could do so. */
int *success; /* Table of acceptance conditions used in
@@ -472,6 +465,11 @@ struct dfa
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
+ position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
+ on demand. */
+ int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or
+ MBCSET. Null if mb_follows.elems has not
+ been allocated. */
};
/* Some macros for user access to dfa internals. */
@@ -487,41 +485,6 @@ struct dfa
static void dfamust (struct dfa *dfa);
static void regexp (void);
-/* These two macros are identical to the ones in gnulib's xalloc.h,
- except that they do not cast the result to "(t *)", and thus may
- be used via type-free CALLOC and MALLOC macros. */
-#undef XNMALLOC
-#undef XCALLOC
-
-/* Allocate memory for N elements of type T, with error checking. */
-/* extern t *XNMALLOC (size_t n, typename t); */
-# define XNMALLOC(n, t) \
- (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
-
-/* Allocate memory for N elements of type T, with error checking,
- and zero it. */
-/* extern t *XCALLOC (size_t n, typename t); */
-# define XCALLOC(n, t) \
- (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
-
-#define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
-#undef MALLOC /* Irix defines this */
-#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
-#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
-
-/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
-#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
- do \
- { \
- if ((n_alloc) <= (n_required)) \
- { \
- size_t new_n_alloc = (n_required) + !(p); \
- (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
- (n_alloc) = new_n_alloc; \
- } \
- } \
- while (false)
-
static void
dfambcache (struct dfa *d)
{
@@ -546,33 +509,34 @@ dfambcache (struct dfa *d)
}
#if MBS_SUPPORT
-/* Given the dfa D, store into *PWC the result of converting the
- leading bytes of the multibyte buffer S of length N bytes, updating
- the conversion state in *MBS. On conversion error, convert just a
- single byte as-is. Return the number of bytes converted.
+/* Store into *PWC the result of converting the leading bytes of the
+ multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ and updating the conversion state in *D. On conversion error,
+ convert just a single byte as-is. Return the number of bytes
+ converted.
- This differs from mbrtowc (PWC, S, N, MBS) as follows:
+ This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
- * Extra arg D, containing an mbrtowc_cache for speed.
+ * The last arg is a dfa *D instead of merely a multibyte conversion
+ state D->mbs. D also contains an mbrtowc_cache for speed.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
* The return value is always in the range 1..N.
- * *MBS is always valid afterwards.
+ * D->mbs is always valid afterwards.
* *PWC is always set to something. */
static size_t
-mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
- mbstate_t *mbs)
+mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
wint_t wc = d->mbrtowc_cache[uc];
if (wc == WEOF)
{
- size_t nbytes = mbrtowc (pwc, s, n, mbs);
+ size_t nbytes = mbrtowc (pwc, s, n, &d->mbs);
if (0 < nbytes && nbytes < (size_t) -2)
return nbytes;
- memset (mbs, 0, sizeof *mbs);
+ memset (&d->mbs, 0, sizeof d->mbs);
wc = uc;
}
@@ -700,36 +664,60 @@ notset (charclass s)
s[i] = ~s[i];
}
-static int
+static bool
equal (charclass const s1, charclass const s2)
{
return memcmp (s1, s2, sizeof (charclass)) == 0;
}
-/* A pointer to the current dfa is kept here during parsing. */
-static struct dfa *dfa;
+/* Ensure that the array addressed by PTR holds at least NITEMS +
+ (PTR || !NITEMS) items. Either return PTR, or reallocate the array
+ and return its new address. Although PTR may be null, the returned
+ value is never null.
+
+ The array holds *NALLOC items; *NALLOC must be zero if PTR is null,
+ and is updated on reallocation. ITEMSIZE is the size of one item.
+ Avoid O(N**2) behavior on arrays growing linearly. */
+static void *
+maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
+{
+ if (nitems < *nalloc)
+ return ptr;
+ *nalloc = nitems;
+ return x2nrealloc (ptr, nalloc, itemsize);
+}
-/* Find the index of charclass s in dfa->charclasses, or allocate a
- new charclass. */
+/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-charclass_index (charclass const s)
+dfa_charclass_index (struct dfa *d, charclass const s)
{
size_t i;
- for (i = 0; i < dfa->cindex; ++i)
- if (equal (s, dfa->charclasses[i]))
+ for (i = 0; i < d->cindex; ++i)
+ if (equal (s, d->charclasses[i]))
return i;
- REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
- ++dfa->cindex;
- copyset (s, dfa->charclasses[i]);
+ d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
+ sizeof *d->charclasses);
+ ++d->cindex;
+ copyset (s, d->charclasses[i]);
return i;
}
+/* A pointer to the current dfa is kept here during parsing. */
+static struct dfa *dfa;
+
+/* Find the index of charclass S in the current DFA, or allocate a new one. */
+static size_t
+charclass_index (charclass const s)
+{
+ return dfa_charclass_index (dfa, s);
+}
+
/* Syntax bits controlling the behavior of the lexical analyzer. */
static reg_syntax_t syntax_bits, syntax_bits_set;
/* Flag for case-folding letters into sets. */
-static int case_fold;
+static bool case_fold;
/* End-of-line byte in data. */
static unsigned char eolbyte;
@@ -759,7 +747,7 @@ static charclass newline;
static int
char_context (unsigned char c)
{
- if (c == eolbyte || c == 0)
+ if (c == eolbyte)
return CTX_NEWLINE;
if (IS_WORD_CONSTITUENT (c))
return CTX_LETTER;
@@ -784,7 +772,7 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
syntax_bits_set = 1;
syntax_bits = bits;
- case_fold = fold;
+ case_fold = fold != 0;
eolbyte = eol;
for (i = 0; i < NOTCHAR; ++i)
@@ -843,19 +831,16 @@ int
using_utf8 (void)
{
static int utf8 = -1;
- if (utf8 == -1)
+ if (utf8 < 0)
{
-#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
- utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
-#else
- utf8 = 0;
-#endif
+ wchar_t wc;
+ mbstate_t mbs = { 0 };
+ utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
#ifdef LIBC_IS_BORKED
if (gawk_mb_cur_max == 1)
utf8 = 0;
#endif
}
-
return utf8;
}
@@ -907,7 +892,7 @@ using_simple_locale (void)
static char const *lexptr; /* Pointer to next input character. */
static size_t lexleft; /* Number of characters remaining. */
static token lasttok; /* Previous token returned; initially END. */
-static int laststart; /* True if we're separated from beginning or (,
+static bool laststart; /* True if we're separated from beginning or (,
| only by zero-width characters. */
static size_t parens; /* Count of outstanding left parens. */
static int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -915,27 +900,8 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
/* These variables are used only if (MB_CUR_MAX > 1). */
-static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
-static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
- Each element stores the number of remaining
- bytes of the corresponding multibyte
- character in the input string. A element's
- value is 0 if the corresponding character is
- single-byte.
- e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
- mblen_buf : 0, 3, 2, 1
- */
-static wchar_t *inputwcs; /* Wide character representation of the input
- string in dfaexec.
- The length of this array is the same as
- the length of input string (char array).
- inputstring[i] is a single-byte char,
- or the first byte of a multibyte char;
- inputwcs[i] is the codepoint. */
-static unsigned char const *buf_begin; /* reference to begin in dfaexec. */
-static unsigned char const *buf_end; /* reference to end in dfaexec. */
#if MBS_SUPPORT
@@ -952,7 +918,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
else \
{ \
wchar_t _wc; \
- size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
+ size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
cur_mb_len = nbytes; \
(wc) = _wc; \
(c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
@@ -999,14 +965,17 @@ static short const lonesome_lower[] =
0x03F5, 0x1E9B, 0x1FBE,
};
-static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
- == CASE_FOLDED_BUFSIZE),
- "CASE_FOLDED_BUFSIZE is wrong");
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is 1 for towupper, 1 for towlower, and 1 for each
+ entry in LONESOME_LOWER. */
+enum
+{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
/* Find the characters equal to C after case-folding, other than C
itself, and store them into FOLDED. Return the number of characters
stored. */
-int
+static int
case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
{
int i;
@@ -1071,7 +1040,7 @@ find_pred (const char *str)
static token
parse_bracket_exp (void)
{
- int invert;
+ bool invert;
int c, c1, c2;
charclass ccl;
@@ -1092,16 +1061,14 @@ parse_bracket_exp (void)
/* Work area to build a mb_char_classes. */
struct mb_char_classes *work_mbc;
- size_t chars_al, range_sts_al, range_ends_al, ch_classes_al,
- equivs_al, coll_elems_al;
+ size_t chars_al, ranges_al, ch_classes_al, equivs_al, coll_elems_al;
- chars_al = 0;
- range_sts_al = range_ends_al = 0;
- ch_classes_al = equivs_al = coll_elems_al = 0;
- if (MB_CUR_MAX > 1)
+ chars_al = ranges_al = ch_classes_al = equivs_al = coll_elems_al = 0;
+ if (dfa->multibyte)
{
- REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc,
- dfa->nmbcsets + 1);
+ dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
+ &dfa->mbcsets_alloc,
+ sizeof *dfa->mbcsets);
/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
We will update dfa->multibyte_prop[] in addtok, because we can't
@@ -1119,11 +1086,11 @@ parse_bracket_exp (void)
if (c == '^')
{
FETCH_WC (c, wc, _("unbalanced ["));
- invert = 1;
+ invert = true;
known_bracket_exp = using_simple_locale ();
}
else
- invert = 0;
+ invert = false;
colon_warning_state = (c == ':');
do
@@ -1173,14 +1140,15 @@ parse_bracket_exp (void)
if (!pred)
dfaerror (_("invalid character class"));
- if (MB_CUR_MAX > 1 && !pred->single_byte_only)
+ if (dfa->multibyte && !pred->single_byte_only)
{
/* Store the character class as wctype_t. */
wctype_t wt = (wctype_t) wctype (class);
- REALLOC_IF_NECESSARY (work_mbc->ch_classes,
- ch_classes_al,
- work_mbc->nch_classes + 1);
+ work_mbc->ch_classes
+ = maybe_realloc (work_mbc->ch_classes,
+ work_mbc->nch_classes, &ch_classes_al,
+ sizeof *work_mbc->ch_classes);
work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
}
@@ -1227,29 +1195,25 @@ parse_bracket_exp (void)
if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
FETCH_WC (c2, wc2, _("unbalanced ["));
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
{
/* When case folding map a range, say [m-z] (or even [M-z])
to the pair of ranges, [m-z] [M-Z]. Although this code
is wrong in multiple ways, it's never used in practice.
FIXME: Remove this (and related) unused code. */
- REALLOC_IF_NECESSARY (work_mbc->range_sts,
- range_sts_al, work_mbc->nranges + 1);
- REALLOC_IF_NECESSARY (work_mbc->range_ends,
- range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] =
- case_fold ? towlower (wc) : (wchar_t) wc;
- work_mbc->range_ends[work_mbc->nranges++] =
- case_fold ? towlower (wc2) : (wchar_t) wc2;
+ work_mbc->ranges
+ = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2,
+ &ranges_al, sizeof *work_mbc->ranges);
+ work_mbc->ranges[work_mbc->nranges].beg
+ = case_fold ? towlower (wc) : wc;
+ work_mbc->ranges[work_mbc->nranges++].end
+ = case_fold ? towlower (wc2) : wc2;
if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
{
- REALLOC_IF_NECESSARY (work_mbc->range_sts,
- range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
- REALLOC_IF_NECESSARY (work_mbc->range_ends,
- range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+ work_mbc->ranges[work_mbc->nranges].beg = towupper (wc);
+ work_mbc->ranges[work_mbc->nranges++].end
+ = towupper (wc2);
}
}
else if (using_simple_locale ())
@@ -1284,7 +1248,7 @@ parse_bracket_exp (void)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (MB_CUR_MAX == 1)
+ if (!dfa->multibyte)
{
if (case_fold)
setbit_case_fold_c (c, ccl);
@@ -1297,16 +1261,17 @@ parse_bracket_exp (void)
{
wchar_t folded[CASE_FOLDED_BUFSIZE];
int i, n = case_folded_counterparts (wc, folded);
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + n);
+ work_mbc->chars = maybe_realloc (work_mbc->chars,
+ work_mbc->nchars + n, &chars_al,
+ sizeof *work_mbc->chars);
for (i = 0; i < n; i++)
if (!setbit_wc (folded[i], ccl))
work_mbc->chars[work_mbc->nchars++] = folded[i];
}
if (!setbit_wc (wc, ccl))
{
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
+ work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars,
+ &chars_al, sizeof *work_mbc->chars);
work_mbc->chars[work_mbc->nchars++] = wc;
}
}
@@ -1318,7 +1283,7 @@ parse_bracket_exp (void)
if (! known_bracket_exp)
return BACKREF;
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
{
static charclass zeroclass;
work_mbc->invert = invert;
@@ -1328,7 +1293,7 @@ parse_bracket_exp (void)
if (invert)
{
- assert (MB_CUR_MAX == 1);
+ assert (!dfa->multibyte);
notset (ccl);
if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit (eolbyte, ccl);
@@ -1341,7 +1306,7 @@ static token
lex (void)
{
unsigned int c, c2;
- int backslash = 0;
+ bool backslash = false;
charclass ccl;
int i;
@@ -1364,7 +1329,7 @@ lex (void)
goto normal_char;
if (lexleft == 0)
dfaerror (_("unfinished \\ escape"));
- backslash = 1;
+ backslash = true;
break;
case '^':
@@ -1402,7 +1367,7 @@ lex (void)
case '9':
if (backslash && !(syntax_bits & RE_NO_BK_REFS))
{
- laststart = 0;
+ laststart = false;
return lasttok = BACKREF;
}
goto normal_char;
@@ -1517,7 +1482,7 @@ lex (void)
lexptr = p;
lexleft = lim - p;
}
- laststart = 0;
+ laststart = false;
return lasttok = REPMN;
case '|':
@@ -1525,21 +1490,21 @@ lex (void)
goto normal_char;
if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- laststart = 1;
+ laststart = true;
return lasttok = OR;
case '\n':
if (syntax_bits & RE_LIMITED_OPS
|| backslash || !(syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- laststart = 1;
+ laststart = true;
return lasttok = OR;
case '(':
if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
++parens;
- laststart = 1;
+ laststart = true;
return lasttok = LPAREN;
case ')':
@@ -1548,17 +1513,17 @@ lex (void)
if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
--parens;
- laststart = 0;
+ laststart = false;
return lasttok = RPAREN;
case '.':
if (backslash)
goto normal_char;
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
- laststart = 0;
+ laststart = false;
return lasttok = ANYCHAR;
}
zeroset (ccl);
@@ -1567,14 +1532,14 @@ lex (void)
clrbit (eolbyte, ccl);
if (syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
case 's':
case 'S':
if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (MB_CUR_MAX == 1)
+ if (!dfa->multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1582,7 +1547,7 @@ lex (void)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
}
@@ -1612,7 +1577,7 @@ lex (void)
POP_LEX_STATE ();
- laststart = 0;
+ laststart = false;
return lasttok;
case 'w':
@@ -1625,21 +1590,21 @@ lex (void)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- laststart = 0;
+ laststart = false;
return lasttok = CSET + charclass_index (ccl);
case '[':
if (backslash)
goto normal_char;
- laststart = 0;
+ laststart = false;
return lasttok = parse_bracket_exp ();
default:
normal_char:
- laststart = 0;
+ laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
return lasttok = WCHAR;
if (case_fold && isalpha (c))
@@ -1671,14 +1636,16 @@ static size_t depth; /* Current depth of a hypothetical stack
static void
addtok_mb (token t, int mbprop)
{
- if (MB_CUR_MAX > 1)
+ if (dfa->talloc == dfa->tindex)
{
- REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop,
- dfa->tindex + 1);
- dfa->multibyte_prop[dfa->tindex] = mbprop;
+ dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
+ sizeof *dfa->tokens);
+ if (dfa->multibyte)
+ dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
+ sizeof *dfa->multibyte_prop);
}
-
- REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1);
+ if (dfa->multibyte)
+ dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
switch (t)
@@ -1710,7 +1677,7 @@ static void addtok_wc (wint_t wc);
static void
addtok (token t)
{
- if (MB_CUR_MAX > 1 && t == MBCSET)
+ if (dfa->multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1967,7 +1934,7 @@ copytoks (size_t tindex, size_t ntokens)
{
size_t i;
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -2050,12 +2017,12 @@ dfaparse (char const *s, size_t len, struct dfa *d)
lexptr = s;
lexleft = len;
lasttok = END;
- laststart = 1;
+ laststart = true;
parens = 0;
- if (MB_CUR_MAX > 1)
+ if (dfa->multibyte)
{
cur_mb_len = 0;
- memset (&mbs, 0, sizeof mbs);
+ memset (&d->mbs, 0, sizeof d->mbs);
}
if (!syntax_bits_set)
@@ -2080,19 +2047,24 @@ dfaparse (char const *s, size_t len, struct dfa *d)
/* Some primitives for operating on sets of positions. */
-/* Copy one set to another; the destination must be large enough. */
+/* Copy one set to another. */
static void
copy (position_set const *src, position_set * dst)
{
- REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem);
- memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem);
+ if (dst->alloc < src->nelem)
+ {
+ free (dst->elems);
+ dst->alloc = src->nelem;
+ dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems);
+ }
+ memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
dst->nelem = src->nelem;
}
static void
alloc_position_set (position_set * s, size_t size)
{
- MALLOC (s->elems, size);
+ s->elems = xnmalloc (size, sizeof *s->elems);
s->alloc = size;
s->nelem = 0;
}
@@ -2122,7 +2094,7 @@ insert (position p, position_set * s)
return;
}
- REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1);
+ s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems);
for (i = count; i > lo; i--)
s->elems[i] = s->elems[i - 1];
s->elems[lo] = p;
@@ -2136,7 +2108,12 @@ merge (position_set const *s1, position_set const *s2, position_set * m)
{
size_t i = 0, j = 0;
- REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem);
+ if (m->alloc < s1->nelem + s2->nelem)
+ {
+ free (m->elems);
+ m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc,
+ sizeof *m->elems);
+ }
m->nelem = 0;
while (i < s1->nelem && j < s2->nelem)
if (s1->elems[i].index > s2->elems[j].index)
@@ -2197,19 +2174,19 @@ state_index (struct dfa *d, position_set const *s, int context)
}
/* We'll have to create a new state. */
- REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
+ d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
+ sizeof *d->states);
d->states[i].hash = hash;
alloc_position_set (&d->states[i].elems, s->nelem);
copy (s, &d->states[i].elems);
d->states[i].context = context;
- d->states[i].backref = 0;
+ d->states[i].has_backref = false;
+ d->states[i].has_mbcset = false;
d->states[i].constraint = 0;
d->states[i].first_end = 0;
- if (MBS_SUPPORT)
- {
- d->states[i].mbps.nelem = 0;
- d->states[i].mbps.elems = NULL;
- }
+ d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.elems = NULL;
+
for (j = 0; j < s->nelem; ++j)
if (d->tokens[s->elems[j].index] < 0)
{
@@ -2222,7 +2199,7 @@ state_index (struct dfa *d, position_set const *s, int context)
else if (d->tokens[s->elems[j].index] == BACKREF)
{
d->states[i].constraint = NO_CONSTRAINT;
- d->states[i].backref = 1;
+ d->states[i].has_backref = true;
}
++d->sindex;
@@ -2239,10 +2216,10 @@ static void
epsclosure (position_set * s, struct dfa const *d)
{
size_t i, j;
- char *visited; /* Array of booleans, enough to use char, not int. */
position p, old;
- CALLOC (visited, d->tindex);
+ /* Array of booleans, large enough to use char, not int. */
+ char *visited = xzalloc (d->tindex);
for (i = 0; i < s->nelem; ++i)
if (d->tokens[s->elems[i].index] >= NOTCHAR
@@ -2399,17 +2376,26 @@ state_separate_contexts (position_set const *s)
void
dfaanalyze (struct dfa *d, int searchflag)
{
- int *nullable; /* Nullable stack. */
- size_t *nfirstpos; /* Element count stack for firstpos sets. */
- position *firstpos; /* Array where firstpos elements are stored. */
- size_t *nlastpos; /* Element count stack for lastpos sets. */
- position *lastpos; /* Array where lastpos elements are stored. */
+ /* Array allocated to hold position sets. */
+ position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
+ /* Firstpos and lastpos elements. */
+ position *firstpos = posalloc + d->nleaves;
+ position *lastpos = firstpos + d->nleaves;
+
+ /* Stack for element counts and nullable flags. */
+ struct
+ {
+ /* Whether the entry is nullable. */
+ bool nullable;
+
+ /* Counts of firstpos and lastpos sets. */
+ size_t nfirstpos;
+ size_t nlastpos;
+ } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
+
position_set tmp; /* Temporary set for merging sets. */
position_set merged; /* Result of merging sets. */
int separate_contexts; /* Context wanted by some position. */
- int *o_nullable;
- size_t *o_nfirst, *o_nlast;
- position *o_firstpos, *o_lastpos;
size_t i, j;
position *pos;
@@ -2423,21 +2409,9 @@ dfaanalyze (struct dfa *d, int searchflag)
putc ('\n', stderr);
#endif
- d->searchflag = searchflag;
-
- MALLOC (nullable, d->depth);
- o_nullable = nullable;
- MALLOC (nfirstpos, d->depth);
- o_nfirst = nfirstpos;
- MALLOC (firstpos, d->nleaves);
- o_firstpos = firstpos, firstpos += d->nleaves;
- MALLOC (nlastpos, d->depth);
- o_nlast = nlastpos;
- MALLOC (lastpos, d->nleaves);
- o_lastpos = lastpos, lastpos += d->nleaves;
+ d->searchflag = searchflag != 0;
alloc_position_set (&merged, d->nleaves);
-
- CALLOC (d->follows, d->tindex);
+ d->follows = xcalloc (d->tindex, sizeof *d->follows);
for (i = 0; i < d->tindex; ++i)
{
@@ -2445,20 +2419,21 @@ dfaanalyze (struct dfa *d, int searchflag)
{
case EMPTY:
/* The empty set is nullable. */
- *nullable++ = 1;
+ stk->nullable = true;
/* The firstpos and lastpos of the empty leaf are both empty. */
- *nfirstpos++ = *nlastpos++ = 0;
+ stk->nfirstpos = stk->nlastpos = 0;
+ stk++;
break;
case STAR:
case PLUS:
/* Every element in the firstpos of the argument is in the follow
of every element in the lastpos. */
- tmp.nelem = nfirstpos[-1];
+ tmp.nelem = stk[-1].nfirstpos;
tmp.elems = firstpos;
pos = lastpos;
- for (j = 0; j < nlastpos[-1]; ++j)
+ for (j = 0; j < stk[-1].nlastpos; ++j)
{
merge (&tmp, &d->follows[pos[j].index], &merged);
copy (&merged, &d->follows[pos[j].index]);
@@ -2467,16 +2442,16 @@ dfaanalyze (struct dfa *d, int searchflag)
case QMARK:
/* A QMARK or STAR node is automatically nullable. */
if (d->tokens[i] != PLUS)
- nullable[-1] = 1;
+ stk[-1].nullable = true;
break;
case CAT:
/* Every element in the firstpos of the second argument is in the
follow of every element in the lastpos of the first argument. */
- tmp.nelem = nfirstpos[-1];
+ tmp.nelem = stk[-1].nfirstpos;
tmp.elems = firstpos;
- pos = lastpos + nlastpos[-1];
- for (j = 0; j < nlastpos[-2]; ++j)
+ pos = lastpos + stk[-1].nlastpos;
+ for (j = 0; j < stk[-2].nlastpos; ++j)
{
merge (&tmp, &d->follows[pos[j].index], &merged);
copy (&merged, &d->follows[pos[j].index]);
@@ -2484,43 +2459,39 @@ dfaanalyze (struct dfa *d, int searchflag)
/* The firstpos of a CAT node is the firstpos of the first argument,
union that of the second argument if the first is nullable. */
- if (nullable[-2])
- nfirstpos[-2] += nfirstpos[-1];
+ if (stk[-2].nullable)
+ stk[-2].nfirstpos += stk[-1].nfirstpos;
else
- firstpos += nfirstpos[-1];
- --nfirstpos;
+ firstpos += stk[-1].nfirstpos;
/* The lastpos of a CAT node is the lastpos of the second argument,
union that of the first argument if the second is nullable. */
- if (nullable[-1])
- nlastpos[-2] += nlastpos[-1];
+ if (stk[-1].nullable)
+ stk[-2].nlastpos += stk[-1].nlastpos;
else
{
- pos = lastpos + nlastpos[-2];
- for (j = nlastpos[-1]; j-- > 0;)
+ pos = lastpos + stk[-2].nlastpos;
+ for (j = stk[-1].nlastpos; j-- > 0;)
pos[j] = lastpos[j];
- lastpos += nlastpos[-2];
- nlastpos[-2] = nlastpos[-1];
+ lastpos += stk[-2].nlastpos;
+ stk[-2].nlastpos = stk[-1].nlastpos;
}
- --nlastpos;
/* A CAT node is nullable if both arguments are nullable. */
- nullable[-2] = nullable[-1] && nullable[-2];
- --nullable;
+ stk[-2].nullable &= stk[-1].nullable;
+ stk--;
break;
case OR:
/* The firstpos is the union of the firstpos of each argument. */
- nfirstpos[-2] += nfirstpos[-1];
- --nfirstpos;
+ stk[-2].nfirstpos += stk[-1].nfirstpos;
/* The lastpos is the union of the lastpos of each argument. */
- nlastpos[-2] += nlastpos[-1];
- --nlastpos;
+ stk[-2].nlastpos += stk[-1].nlastpos;
/* An OR node is nullable if either argument is nullable. */
- nullable[-2] = nullable[-1] || nullable[-2];
- --nullable;
+ stk[-2].nullable |= stk[-1].nullable;
+ stk--;
break;
default:
@@ -2529,10 +2500,12 @@ dfaanalyze (struct dfa *d, int searchflag)
an "epsilon closure" effectively makes them nullable later.
Backreferences have to get a real position so we can detect
transitions on them later. But they are nullable. */
- *nullable++ = d->tokens[i] == BACKREF;
+ stk->nullable = d->tokens[i] == BACKREF;
/* This position is in its own firstpos and lastpos. */
- *nfirstpos++ = *nlastpos++ = 1;
+ stk->nfirstpos = stk->nlastpos = 1;
+ stk++;
+
--firstpos, --lastpos;
firstpos->index = lastpos->index = i;
firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
@@ -2546,15 +2519,16 @@ dfaanalyze (struct dfa *d, int searchflag)
fprintf (stderr, "node %zd:", i);
prtok (d->tokens[i]);
putc ('\n', stderr);
- fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
+ fprintf (stderr,
+ stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
fprintf (stderr, " firstpos:");
- for (j = nfirstpos[-1]; j-- > 0;)
+ for (j = stk[-1].nfirstpos; j-- > 0;)
{
fprintf (stderr, " %zd:", firstpos[j].index);
prtok (d->tokens[firstpos[j].index]);
}
fprintf (stderr, "\n lastpos:");
- for (j = nlastpos[-1]; j-- > 0;)
+ for (j = stk[-1].nlastpos; j-- > 0;)
{
fprintf (stderr, " %zd:", lastpos[j].index);
prtok (d->tokens[lastpos[j].index]);
@@ -2591,25 +2565,18 @@ dfaanalyze (struct dfa *d, int searchflag)
/* Get the epsilon closure of the firstpos of the regexp. The result will
be the set of positions of state 0. */
merged.nelem = 0;
- for (i = 0; i < nfirstpos[-1]; ++i)
+ for (i = 0; i < stk[-1].nfirstpos; ++i)
insert (firstpos[i], &merged);
epsclosure (&merged, d);
/* Build the initial state. */
- d->salloc = 1;
- d->sindex = 0;
- MALLOC (d->states, d->salloc);
-
separate_contexts = state_separate_contexts (&merged);
state_index (d, &merged,
(separate_contexts & CTX_NEWLINE
? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
- free (o_nullable);
- free (o_nfirst);
- free (o_firstpos);
- free (o_nlast);
- free (o_lastpos);
+ free (posalloc);
+ free (stkalloc);
free (merged.elems);
}
@@ -2647,16 +2614,16 @@ dfaanalyze (struct dfa *d, int searchflag)
void
dfastate (state_num s, struct dfa *d, state_num trans[])
{
- leaf_set *grps; /* As many as will ever be needed. */
- charclass *labels; /* Labels corresponding to the groups. */
+ leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */
+ charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
size_t ngrps = 0; /* Number of groups actually used. */
position pos; /* Current position being considered. */
charclass matches; /* Set of matching characters. */
- int matchesf; /* True if matches is nonempty. */
+ unsigned int matchesf; /* Nonzero if matches is nonempty. */
charclass intersect; /* Intersection with some label set. */
- int intersectf; /* True if intersect is nonempty. */
+ unsigned int intersectf; /* Nonzero if intersect is nonempty. */
charclass leftovers; /* Stuff in the label that didn't match. */
- int leftoversf; /* True if leftovers is nonempty. */
+ unsigned int leftoversf; /* Nonzero if leftovers is nonempty. */
position_set follows; /* Union of the follows of some group. */
position_set tmp; /* Temporary space for merging sets. */
int possible_contexts; /* Contexts that this group can match. */
@@ -2664,12 +2631,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_num state; /* New state. */
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
- int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
+ bool next_isnt_1st_byte = false; /* Flag if we can't add state0. */
size_t i, j, k;
- MALLOC (grps, NOTCHAR);
- MALLOC (labels, NOTCHAR);
-
zeroset (matches);
for (i = 0; i < d->states[s].elems.nelem; ++i)
@@ -2732,7 +2696,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
matches. */
intersectf = 0;
for (k = 0; k < CHARCLASS_INTS; ++k)
- (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
+ intersectf |= intersect[k] = matches[k] & labels[j][k];
if (!intersectf)
continue;
@@ -2743,8 +2707,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* Even an optimizing compiler can't know this for sure. */
int match = matches[k], label = labels[j][k];
- (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
- (matches[k] = match & ~label) ? (matchesf = 1) : 0;
+ leftoversf |= leftovers[k] = ~match & label;
+ matchesf |= matches[k] = match & ~label;
}
/* If there were leftovers, create a new group labeled with them. */
@@ -2752,7 +2716,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
{
copyset (leftovers, labels[ngrps]);
copyset (intersect, labels[j]);
- MALLOC (grps[ngrps].elems, d->nleaves);
+ grps[ngrps].elems = xnmalloc (d->nleaves,
+ sizeof *grps[ngrps].elems);
memcpy (grps[ngrps].elems, grps[j].elems,
sizeof (grps[j].elems[0]) * grps[j].nelem);
grps[ngrps].nelem = grps[j].nelem;
@@ -2775,7 +2740,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
{
copyset (matches, labels[ngrps]);
zeroset (matches);
- MALLOC (grps[ngrps].elems, d->nleaves);
+ grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
grps[ngrps].nelem = 1;
grps[ngrps].elems[0] = pos.index;
++ngrps;
@@ -2821,7 +2786,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->mb_cur_max > 1)
+ if (d->multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2841,12 +2806,12 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
codepoint of <sb a>, it must not be <sb a> but 2nd byte of
<mb A>, so we cannot add state[0]. */
- next_isnt_1st_byte = 0;
+ next_isnt_1st_byte = false;
for (j = 0; j < follows.nelem; ++j)
{
if (!(d->multibyte_prop[follows.elems[j].index] & 1))
{
- next_isnt_1st_byte = 1;
+ next_isnt_1st_byte = true;
break;
}
}
@@ -2855,7 +2820,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
if (d->searchflag
- && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
+ && (!MBS_SUPPORT || (!d->multibyte || !next_isnt_1st_byte)))
for (j = 0; j < d->states[0].elems.nelem; ++j)
insert (d->states[0].elems.elems[j], &follows);
@@ -2897,8 +2862,31 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
free (grps[i].elems);
free (follows.elems);
free (tmp.elems);
- free (grps);
- free (labels);
+}
+
+/* Make sure D's state arrays are large enough to hold NEW_STATE. */
+static void
+realloc_trans_if_necessary (struct dfa *d, state_num new_state)
+{
+ state_num oldalloc = d->tralloc;
+ if (oldalloc <= new_state)
+ {
+ state_num **realtrans = d->trans ? d->trans - 1 : NULL;
+ size_t newalloc, newalloc1;
+ newalloc1 = new_state + 1;
+ realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
+ realtrans[0] = NULL;
+ d->trans = realtrans + 1;
+ d->tralloc = newalloc = newalloc1 - 1;
+ d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
+ d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
+ d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
+ for (; oldalloc < newalloc; oldalloc++)
+ {
+ d->trans[oldalloc] = NULL;
+ d->fails[oldalloc] = NULL;
+ }
+ }
}
/* Some routines for manipulating a compiled dfa's transition tables.
@@ -2912,7 +2900,7 @@ static void
build_state (state_num s, struct dfa *d)
{
state_num *trans; /* The new transition table. */
- state_num i;
+ state_num i, maxstate;
/* Set an upper limit on the number of transition tables that will ever
exist at once. 1024 is arbitrary. The idea is that the frequently
@@ -2940,30 +2928,17 @@ build_state (state_num s, struct dfa *d)
if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
d->success[s] |= CTX_NONE;
- MALLOC (trans, NOTCHAR);
+ trans = xmalloc (NOTCHAR * sizeof *trans);
dfastate (s, d, trans);
/* Now go through the new transition table, and make sure that the trans
and fail arrays are allocated large enough to hold a pointer for the
largest state mentioned in the table. */
+ maxstate = -1;
for (i = 0; i < NOTCHAR; ++i)
- if (trans[i] >= d->tralloc)
- {
- state_num oldalloc = d->tralloc;
-
- while (trans[i] >= d->tralloc)
- d->tralloc *= 2;
- REALLOC (d->realtrans, d->tralloc + 1);
- d->trans = d->realtrans + 1;
- REALLOC (d->fails, d->tralloc);
- REALLOC (d->success, d->tralloc);
- REALLOC (d->newlines, d->tralloc);
- while (oldalloc < d->tralloc)
- {
- d->trans[oldalloc] = NULL;
- d->fails[oldalloc++] = NULL;
- }
- }
+ if (maxstate < trans[i])
+ maxstate = trans[i];
+ realloc_trans_if_necessary (d, maxstate);
/* Keep the newline transition in a special place so we can use it as
a sentinel. */
@@ -2979,65 +2954,21 @@ build_state (state_num s, struct dfa *d)
static void
build_state_zero (struct dfa *d)
{
- d->tralloc = 1;
+ /* Initial size of the transition tables; must be positive. */
+ int initial_tab_size = 1;
+
+ d->tralloc = 0;
d->trcount = 0;
- CALLOC (d->realtrans, d->tralloc + 1);
- d->trans = d->realtrans + 1;
- CALLOC (d->fails, d->tralloc);
- MALLOC (d->success, d->tralloc);
- MALLOC (d->newlines, d->tralloc);
+ d->trans = NULL;
+ d->fails = NULL;
+ d->success = NULL;
+ d->newlines = NULL;
+ realloc_trans_if_necessary (d, initial_tab_size);
build_state (0, d);
}
/* Multibyte character handling sub-routines for dfaexec. */
-/* The initial state may encounter a byte which is not a single byte character
- nor the first byte of a multibyte character. But it is incorrect for the
- initial state to accept such a byte. For example, in Shift JIS the regular
- expression "\\" accepts the codepoint 0x5c, but should not accept the second
- byte of the codepoint 0x815c. Then the initial state must skip the bytes
- that are not a single byte character nor the first byte of a multibyte
- character. */
-#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
- if (s == 0) \
- { \
- while (inputwcs[p - buf_begin] == 0 \
- && mblen_buf[p - buf_begin] > 0 \
- && (unsigned char const *) p < buf_end) \
- ++p; \
- if ((char *) p >= end) \
- { \
- free (mblen_buf); \
- free (inputwcs); \
- *end = saved_end; \
- return NULL; \
- } \
- }
-
-static void
-realloc_trans_if_necessary (struct dfa *d, state_num new_state)
-{
- /* Make sure that the trans and fail arrays are allocated large enough
- to hold a pointer for the new state. */
- if (new_state >= d->tralloc)
- {
- state_num oldalloc = d->tralloc;
-
- while (new_state >= d->tralloc)
- d->tralloc *= 2;
- REALLOC (d->realtrans, d->tralloc + 1);
- d->trans = d->realtrans + 1;
- REALLOC (d->fails, d->tralloc);
- REALLOC (d->success, d->tralloc);
- REALLOC (d->newlines, d->tralloc);
- while (oldalloc < d->tralloc)
- {
- d->trans[oldalloc] = NULL;
- d->fails[oldalloc++] = NULL;
- }
- }
-}
-
/* Return values of transit_state_singlebyte, and
transit_state_consume_1char. */
typedef enum
@@ -3070,14 +3001,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
works = 0;
}
else if (works < 0)
- {
- if (p == buf_end)
- {
- /* At the moment, it must not happen. */
- abort ();
- }
- works = 0;
- }
+ works = 0;
else if (d->fails[works])
{
works = d->fails[works][*p];
@@ -3092,18 +3016,13 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
return rval;
}
-/* Match a "." against the current context. buf_begin[IDX] is the
- current position. Return the length of the match, in bytes.
- POS is the position of the ".". */
+/* Match a "." against the current context. Return the length of the
+ match, in bytes. POS is the position of the ".". */
static int
-match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
+match_anychar (struct dfa *d, state_num s, position pos,
+ wchar_t wc, size_t mbclen)
{
int context;
- wchar_t wc;
- int mbclen;
-
- wc = inputwcs[idx];
- mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3125,16 +3044,14 @@ match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
}
/* Match a bracket expression against the current context.
- buf_begin[IDX] is the current position.
Return the length of the match, in bytes.
POS is the position of the bracket expression. */
static int
-match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
+match_mb_charset (struct dfa *d, state_num s, position pos,
+ char const *p, wchar_t wc, size_t match_len)
{
size_t i;
- int match; /* Matching succeeded. */
- int match_len; /* Length of the character (or collating element)
- with which this operator matches. */
+ bool match; /* Matching succeeded. */
int op_len; /* Length of the operator. */
char buffer[128];
@@ -3142,9 +3059,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
struct mb_char_classes *work_mbc;
int context;
- wchar_t wc; /* Current referring character. */
-
- wc = inputwcs[idx];
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3165,7 +3079,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
/* Assign the current referring operator to work_mbc. */
work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
match = !work_mbc->invert;
- match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
/* Match in range 0-255? */
if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3179,14 +3092,14 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
goto charset_matched;
}
- strncpy (buffer, (char const *) buf_begin + idx, match_len);
+ strncpy (buffer, p, match_len);
buffer[match_len] = '\0';
/* match with an equivalence class? */
for (i = 0; i < work_mbc->nequivs; i++)
{
op_len = strlen (work_mbc->equivs[i]);
- strncpy (buffer, (char const *) buf_begin + idx, op_len);
+ strncpy (buffer, p, op_len);
buffer[op_len] = '\0';
if (strcoll (work_mbc->equivs[i], buffer) == 0)
{
@@ -3199,7 +3112,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
for (i = 0; i < work_mbc->ncoll_elems; i++)
{
op_len = strlen (work_mbc->coll_elems[i]);
- strncpy (buffer, (char const *) buf_begin + idx, op_len);
+ strncpy (buffer, p, op_len);
buffer[op_len] = '\0';
if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
@@ -3212,7 +3125,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
/* match with a range? */
for (i = 0; i < work_mbc->nranges; i++)
{
- if (work_mbc->range_sts[i] <= wc && wc <= work_mbc->range_ends[i])
+ if (work_mbc->ranges[i].beg <= wc && wc <= work_mbc->ranges[i].end)
goto charset_matched;
}
@@ -3233,27 +3146,25 @@ charset_matched:
array which corresponds to 'd->states[s].mbps.elem'; each element of the
array contains the number of bytes with which the element can match.
- 'idx' is the index from buf_begin, and it is the current position
- in the buffer.
-
The caller MUST free the array which this function return. */
static int *
-check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
+check_matching_with_multibyte_ops (struct dfa *d, state_num s,
+ char const *p, wchar_t wc, size_t mbclen)
{
size_t i;
int *rarray;
- MALLOC (rarray, d->states[s].mbps.nelem);
+ rarray = d->mb_match_lens;
for (i = 0; i < d->states[s].mbps.nelem; ++i)
{
position pos = d->states[s].mbps.elems[i];
switch (d->tokens[pos.index])
{
case ANYCHAR:
- rarray[i] = match_anychar (d, s, pos, idx);
+ rarray[i] = match_anychar (d, s, pos, wc, mbclen);
break;
case MBCSET:
- rarray[i] = match_mb_charset (d, s, pos, idx);
+ rarray[i] = match_mb_charset (d, s, pos, p, wc, mbclen);
break;
default:
break; /* cannot happen. */
@@ -3273,48 +3184,39 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
static status_transit_state
transit_state_consume_1char (struct dfa *d, state_num s,
unsigned char const **pp,
- int *match_lens, int *mbclen, position_set * pps)
+ wchar_t wc, size_t mbclen,
+ int *match_lens)
{
size_t i, j;
int k;
state_num s1, s2;
- int *work_mbls;
status_transit_state rs = TRANSIT_STATE_DONE;
- /* Calculate the length of the (single/multi byte) character
- to which p points. */
- *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
+ if (! match_lens && d->states[s].mbps.nelem != 0)
+ match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
+ wc, mbclen);
/* Calculate the state which can be reached from the state 's' by
- consuming '*mbclen' single bytes from the buffer. */
+ consuming 'mbclen' single bytes from the buffer. */
s1 = s;
- for (k = 0; k < *mbclen; k++)
+ for (k = 0; k < mbclen; k++)
{
s2 = s1;
rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
}
- /* Copy the positions contained by 's1' to the set 'pps'. */
- copy (&(d->states[s1].elems), pps);
-
- /* Check (input) match_lens, and initialize if it is NULL. */
- if (match_lens == NULL && d->states[s].mbps.nelem != 0)
- work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
- else
- work_mbls = match_lens;
+ copy (&d->states[s1].elems, &d->mb_follows);
/* Add all of the positions which can be reached from 's' by consuming
a single character. */
for (i = 0; i < d->states[s].mbps.nelem; i++)
{
- if (work_mbls[i] == *mbclen)
+ if (match_lens[i] == mbclen)
for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
j++)
- insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
+ insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+ &d->mb_follows);
}
- if (match_lens == NULL && work_mbls != NULL)
- free (work_mbls);
-
/* FIXME: this return value is always ignored. */
return rs;
}
@@ -3323,7 +3225,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
buffer. This function is for some operator which can match with a multi-
byte character or a collating element (which may be multi characters). */
static state_num
-transit_state (struct dfa *d, state_num s, unsigned char const **pp)
+transit_state (struct dfa *d, state_num s, unsigned char const **pp,
+ unsigned char const *end)
{
state_num s1;
int mbclen; /* The length of current input multibyte character. */
@@ -3331,7 +3234,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
size_t i, j;
int *match_lens = NULL;
size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
- position_set follows;
unsigned char const *p1 = *pp;
wchar_t wc;
@@ -3340,7 +3242,9 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
We check whether each of them can match or not. */
{
/* Note: caller must free the return value of this function. */
- match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
+ mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+ match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
+ wc, mbclen);
for (i = 0; i < nelem; i++)
/* Search the operator which match the longest string,
@@ -3362,26 +3266,25 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
if (rs == TRANSIT_STATE_DONE)
++*pp;
- free (match_lens);
return s1;
}
/* This state has some operators which can match a multibyte character. */
- alloc_position_set (&follows, d->nleaves);
+ d->mb_follows.nelem = 0;
/* 'maxlen' may be longer than the length of a character, because it may
not be a character but a (multi character) collating element.
We enumerate all of the positions which 's' can reach by consuming
'maxlen' bytes. */
- transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
+ transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
- wc = inputwcs[*pp - mbclen - buf_begin];
- s1 = state_index (d, &follows, wchar_context (wc));
+ s1 = state_index (d, &d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
while (*pp - p1 < maxlen)
{
- transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
+ mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
+ transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
for (i = 0; i < nelem; i++)
{
@@ -3389,51 +3292,15 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
for (j = 0;
j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
- &follows);
+ &d->mb_follows);
}
- wc = inputwcs[*pp - mbclen - buf_begin];
- s1 = state_index (d, &follows, wchar_context (wc));
+ s1 = state_index (d, &d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
}
- free (match_lens);
- free (follows.elems);
return s1;
}
-
-/* Initialize mblen_buf and inputwcs with data from the next line. */
-
-static void
-prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
-{
-#if MBS_SUPPORT
- unsigned char eol = eolbyte;
- size_t i;
- size_t ilim = end - begin + 1;
-
- buf_begin = (unsigned char *) begin;
-
- for (i = 0; i < ilim; i++)
- {
- size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs);
- mblen_buf[i] = nbytes - (nbytes == 1);
- if (begin[i] == eol)
- break;
- while (--nbytes != 0)
- {
- i++;
- mblen_buf[i] = nbytes;
- inputwcs[i] = 0;
- }
- }
-
- buf_end = (unsigned char *) (begin + i);
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
-#endif /* MBS_SUPPORT */
-}
-
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3451,39 +3318,64 @@ dfaexec (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
state_num s, s1; /* Current state. */
- unsigned char const *p; /* Current input character. */
+ unsigned char const *p, *mbp; /* Current input character. */
state_num **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
unsigned char eol = eolbyte; /* Likewise for eolbyte. */
unsigned char saved_end;
+ size_t nlcount = 0;
if (!d->tralloc)
build_state_zero (d);
s = s1 = 0;
- p = (unsigned char const *) begin;
+ p = mbp = (unsigned char const *) begin;
trans = d->trans;
saved_end = *(unsigned char *) end;
*end = eol;
- if (d->mb_cur_max > 1)
+ if (d->multibyte)
{
- MALLOC (mblen_buf, end - begin + 2);
- MALLOC (inputwcs, end - begin + 2);
- memset (&mbs, 0, sizeof (mbstate_t));
- prepare_wc_buf (d, (const char *) p, end);
+ memset (&d->mbs, 0, sizeof d->mbs);
+ if (! d->mb_match_lens)
+ {
+ d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
+ alloc_position_set (&d->mb_follows, d->nleaves);
+ }
}
for (;;)
{
- if (d->mb_cur_max > 1)
+ if (d->multibyte)
{
while ((t = trans[s]) != NULL)
{
- if (p > buf_end)
- break;
s1 = s;
- SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
+
+ if (s == 0)
+ {
+ /* The initial state may encounter a byte which is not
+ a single byte character nor the first byte of a
+ multibyte character. But it is incorrect for the
+ initial state to accept such a byte. For example,
+ in Shift JIS the regular expression "\\" accepts
+ the codepoint 0x5c, but should not accept the second
+ byte of the codepoint 0x815c. Then the initial
+ state must skip the bytes that are not a single
+ byte character nor the first byte of a multibyte
+ character. */
+ wchar_t wc;
+ while (mbp < p)
+ mbp += mbs_to_wchar (&wc, (char const *) mbp,
+ end - (char const *) mbp, d);
+ p = mbp;
+
+ if ((char *) p >= end)
+ {
+ p = NULL;
+ goto done;
+ }
+ }
if (d->states[s].mbps.nelem == 0)
{
@@ -3495,18 +3387,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
better performance (up to 25% better on [a-z], for
example) and enables support for collating symbols and
equivalence classes. */
- if (backref)
+ if (d->states[s].has_mbcset && backref)
{
*backref = 1;
- free (mblen_buf);
- free (inputwcs);
- *end = saved_end;
- return (char *) p;
+ goto done;
}
/* Can match with a multibyte character (and multi character
collating element). Transition table might be updated. */
- s = transit_state (d, s, &p);
+ s = transit_state (d, s, &p, (unsigned char *) end);
+ mbp = p;
trans = d->trans;
}
}
@@ -3526,27 +3416,28 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
}
- if (s >= 0 && (char *) p <= end && d->fails[s])
+ if ((char *) p > end)
+ {
+ p = NULL;
+ goto done;
+ }
+
+ if (s >= 0 && d->fails[s])
{
if (d->success[s] & sbit[*p])
{
if (backref)
- *backref = (d->states[s].backref != 0);
- if (d->mb_cur_max > 1)
- {
- free (mblen_buf);
- free (inputwcs);
- }
- *end = saved_end;
- return (char *) p;
+ *backref = d->states[s].has_backref;
+ goto done;
}
s1 = s;
- if (d->mb_cur_max > 1)
+ if (d->multibyte)
{
/* Can match with a multibyte character (and multicharacter
collating element). Transition table might be updated. */
- s = transit_state (d, s, &p);
+ s = transit_state (d, s, &p, (unsigned char *) end);
+ mbp = p;
trans = d->trans;
}
else
@@ -3554,31 +3445,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
continue;
}
- /* If the previous character was a newline, count it. */
- if ((char *) p <= end && p[-1] == eol)
+ /* If the previous character was a newline, count it, and skip
+ checking of multibyte character boundary until here. */
+ if (p[-1] == eol)
{
- if (count)
- ++*count;
-
- if (d->mb_cur_max > 1)
- prepare_wc_buf (d, (const char *) p, end);
- }
-
- /* Check if we've run off the end of the buffer. */
- if ((char *) p > end)
- {
- if (d->mb_cur_max > 1)
- {
- free (mblen_buf);
- free (inputwcs);
- }
- *end = saved_end;
- return NULL;
+ nlcount++;
+ mbp = p;
}
if (s >= 0)
{
- build_state (s, d);
+ if (!d->trans[s])
+ build_state (s, d);
trans = d->trans;
continue;
}
@@ -3591,6 +3469,32 @@ dfaexec (struct dfa *d, char const *begin, char *end,
s = 0;
}
+
+ done:
+ if (count)
+ *count += nlcount;
+ *end = saved_end;
+ return (char *) p;
+}
+
+/* Search through a buffer looking for a potential match for D.
+ Return the offset of the byte after the first potential match.
+ If there is no match, return (size_t) -1. If D lacks a superset
+ so it's not known whether there is a match, return (size_t) -2.
+ BEGIN points to the beginning of the buffer, and END points to the
+ first byte after its end. Store a sentinel byte (usually newline)
+ in *END, so the actual buffer must be one byte longer. If COUNT is
+ non-NULL, increment *COUNT once for each newline processed. */
+size_t
+dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
+{
+ if (! d->superset)
+ return -2;
+ else
+ {
+ char const *match = dfaexec (d->superset, begin, end, 1, count, NULL);
+ return match ? match - begin : -1;
+ }
}
static void
@@ -3599,7 +3503,6 @@ free_mbdata (struct dfa *d)
size_t i;
free (d->multibyte_prop);
- d->multibyte_prop = NULL;
for (i = 0; i < d->nmbcsets; ++i)
{
@@ -3607,8 +3510,7 @@ free_mbdata (struct dfa *d)
struct mb_char_classes *p = &(d->mbcsets[i]);
free (p->chars);
free (p->ch_classes);
- free (p->range_sts);
- free (p->range_ends);
+ free (p->ranges);
for (j = 0; j < p->nequivs; ++j)
free (p->equivs[j]);
@@ -3620,8 +3522,9 @@ free_mbdata (struct dfa *d)
}
free (d->mbcsets);
- d->mbcsets = NULL;
- d->nmbcsets = 0;
+ free (d->mb_follows.elems);
+ free (d->mb_match_lens);
+ d->mb_match_lens = NULL;
}
/* Initialize the components of a dfa that the other routines don't
@@ -3630,22 +3533,7 @@ void
dfainit (struct dfa *d)
{
memset (d, 0, sizeof *d);
-
- d->calloc = 1;
- MALLOC (d->charclasses, d->calloc);
-
- d->talloc = 1;
- MALLOC (d->tokens, d->talloc);
-
- d->mb_cur_max = MB_CUR_MAX;
-
- if (d->mb_cur_max > 1)
- {
- d->nmultibyte_prop = 1;
- MALLOC (d->multibyte_prop, d->nmultibyte_prop);
- d->mbcsets_alloc = 1;
- MALLOC (d->mbcsets, d->mbcsets_alloc);
- }
+ d->multibyte = MB_CUR_MAX > 1;
}
static void
@@ -3672,7 +3560,83 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->mb_cur_max = 1;
+ d->multibyte = false;
+}
+
+static void
+dfasuperset (struct dfa *d)
+{
+ size_t i, j;
+ charclass ccl;
+ bool have_achar = false;
+ bool have_nchar = false;
+ struct dfa *sup = dfaalloc ();
+
+ *sup = *d;
+ sup->multibyte = false;
+ sup->multibyte_prop = NULL;
+ sup->mbcsets = NULL;
+ sup->superset = NULL;
+ sup->states = NULL;
+ sup->sindex = 0;
+ sup->follows = NULL;
+ sup->tralloc = 0;
+ sup->trans = NULL;
+ sup->fails = NULL;
+ sup->success = NULL;
+ sup->newlines = NULL;
+ sup->musts = NULL;
+
+ sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
+ memcpy (sup->charclasses, d->charclasses,
+ d->cindex * sizeof *sup->charclasses);
+
+ sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
+ sup->talloc = d->tindex * 2;
+
+ for (i = j = 0; i < d->tindex; i++)
+ {
+ switch (d->tokens[i])
+ {
+ case ANYCHAR:
+ case MBCSET:
+ case BACKREF:
+ zeroset (ccl);
+ notset (ccl);
+ sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = STAR;
+ if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
+ || d->tokens[i + 1] == PLUS)
+ i++;
+ have_achar = true;
+ break;
+ case BEGWORD:
+ case ENDWORD:
+ case LIMWORD:
+ case NOTLIMWORD:
+ if (d->multibyte)
+ {
+ /* Ignore these constraints. */
+ sup->tokens[j++] = EMPTY;
+ break;
+ }
+ default:
+ sup->tokens[j++] = d->tokens[i];
+ if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
+ || d->tokens[i] >= CSET)
+ have_nchar = true;
+ break;
+ }
+ }
+ sup->tindex = j;
+
+ if (have_nchar && (have_achar || d->multibyte))
+ d->superset = sup;
+ else
+ {
+ dfafree (sup);
+ free (sup);
+ }
}
/* Parse and analyze a single string of the given length. */
@@ -3684,7 +3648,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
+ dfasuperset (d);
dfaanalyze (d, searchflag);
+ if (d->superset)
+ dfaanalyze (d->superset, searchflag);
}
/* Free the storage held by the components of a dfa. */
@@ -3697,34 +3664,46 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->mb_cur_max > 1)
+ if (d->multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
{
free (d->states[i].elems.elems);
- if (MBS_SUPPORT)
- free (d->states[i].mbps.elems);
+ free (d->states[i].mbps.elems);
}
free (d->states);
- for (i = 0; i < d->tindex; ++i)
- free (d->follows[i].elems);
- free (d->follows);
- for (i = 0; i < d->tralloc; ++i)
+
+ if (d->follows)
+ {
+ for (i = 0; i < d->tindex; ++i)
+ free (d->follows[i].elems);
+ free (d->follows);
+ }
+
+ if (d->trans)
{
- free (d->trans[i]);
- free (d->fails[i]);
+ for (i = 0; i < d->tralloc; ++i)
+ {
+ free (d->trans[i]);
+ free (d->fails[i]);
+ }
+
+ free (d->trans - 1);
+ free (d->fails);
+ free (d->newlines);
+ free (d->success);
}
- free (d->realtrans);
- free (d->fails);
- free (d->newlines);
- free (d->success);
+
for (dm = d->musts; dm; dm = ndm)
{
ndm = dm->next;
free (dm->must);
free (dm);
}
+
+ if (d->superset)
+ dfafree (d->superset);
}
/* Having found the postfix representation of the regular expression,
@@ -3815,21 +3794,16 @@ static char *
icatalloc (char *old, char const *new)
{
char *result;
- size_t oldsize = old == NULL ? 0 : strlen (old);
- size_t newsize = new == NULL ? 0 : strlen (new);
+ size_t oldsize;
+ size_t newsize = strlen (new);
if (newsize == 0)
return old;
+ oldsize = strlen (old);
result = xrealloc (old, oldsize + newsize + 1);
memcpy (result + oldsize, new, newsize + 1);
return result;
}
-static char *
-icpyalloc (char const *string)
-{
- return icatalloc (NULL, string);
-}
-
static char *_GL_ATTRIBUTE_PURE
istrstr (char const *lookin, char const *lookfor)
{
@@ -3846,29 +3820,15 @@ istrstr (char const *lookin, char const *lookfor)
static void
freelist (char **cpp)
{
- size_t i;
-
- if (cpp == NULL)
- return;
- for (i = 0; cpp[i] != NULL; ++i)
- {
- free (cpp[i]);
- cpp[i] = NULL;
- }
+ while (*cpp)
+ free (*cpp++);
}
static char **
enlist (char **cpp, char *new, size_t len)
{
size_t i, j;
-
- if (cpp == NULL)
- return NULL;
- if ((new = icpyalloc (new)) == NULL)
- {
- freelist (cpp);
- return NULL;
- }
+ new = memcpy (xmalloc (len + 1), new, len);
new[len] = '\0';
/* Is there already something in the list that's new (or longer)? */
for (i = 0; cpp[i] != NULL; ++i)
@@ -3891,53 +3851,35 @@ enlist (char **cpp, char *new, size_t len)
cpp[i] = NULL;
}
/* Add the new string. */
- REALLOC (cpp, i + 2);
+ cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
cpp[i] = new;
cpp[i + 1] = NULL;
return cpp;
}
/* Given pointers to two strings, return a pointer to an allocated
- list of their distinct common substrings. Return NULL if something
- seems wild. */
+ list of their distinct common substrings. */
static char **
comsubs (char *left, char const *right)
{
- char **cpp;
+ char **cpp = xzalloc (sizeof *cpp);
char *lcp;
- char *rcp;
- size_t i, len;
-
- if (left == NULL || right == NULL)
- return NULL;
- cpp = malloc (sizeof *cpp);
- if (cpp == NULL)
- return NULL;
- cpp[0] = NULL;
+
for (lcp = left; *lcp != '\0'; ++lcp)
{
- len = 0;
- rcp = strchr (right, *lcp);
+ size_t len = 0;
+ char *rcp = strchr (right, *lcp);
while (rcp != NULL)
{
+ size_t i;
for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
continue;
if (i > len)
len = i;
rcp = strchr (rcp + 1, *lcp);
}
- if (len == 0)
- continue;
- {
- char **p = enlist (cpp, lcp, len);
- if (p == NULL)
- {
- freelist (cpp);
- cpp = NULL;
- break;
- }
- cpp = p;
- }
+ if (len != 0)
+ cpp = enlist (cpp, lcp, len);
}
return cpp;
}
@@ -3945,16 +3887,8 @@ comsubs (char *left, char const *right)
static char **
addlists (char **old, char **new)
{
- size_t i;
-
- if (old == NULL || new == NULL)
- return NULL;
- for (i = 0; new[i] != NULL; ++i)
- {
- old = enlist (old, new[i], strlen (new[i]));
- if (old == NULL)
- break;
- }
+ for (; *new; new++)
+ old = enlist (old, *new, strlen (*new));
return old;
}
@@ -3963,125 +3897,127 @@ addlists (char **old, char **new)
static char **
inboth (char **left, char **right)
{
- char **both;
- char **temp;
+ char **both = xzalloc (sizeof *both);
size_t lnum, rnum;
- if (left == NULL || right == NULL)
- return NULL;
- both = malloc (sizeof *both);
- if (both == NULL)
- return NULL;
- both[0] = NULL;
for (lnum = 0; left[lnum] != NULL; ++lnum)
{
for (rnum = 0; right[rnum] != NULL; ++rnum)
{
- temp = comsubs (left[lnum], right[rnum]);
- if (temp == NULL)
- {
- freelist (both);
- return NULL;
- }
+ char **temp = comsubs (left[lnum], right[rnum]);
both = addlists (both, temp);
freelist (temp);
free (temp);
- if (both == NULL)
- return NULL;
}
}
return both;
}
-typedef struct
+typedef struct must must;
+
+struct must
{
char **in;
char *left;
char *right;
char *is;
-} must;
+ bool begline;
+ bool endline;
+ must *prev;
+};
+
+static must *
+allocmust (must *mp)
+{
+ must *new_mp = xmalloc (sizeof *new_mp);
+ new_mp->in = xzalloc (sizeof *new_mp->in);
+ new_mp->left = xzalloc (2);
+ new_mp->right = xzalloc (2);
+ new_mp->is = xzalloc (2);
+ new_mp->begline = false;
+ new_mp->endline = false;
+ new_mp->prev = mp;
+ return new_mp;
+}
static void
-resetmust (must * mp)
+resetmust (must *mp)
{
+ freelist (mp->in);
+ mp->in[0] = NULL;
mp->left[0] = mp->right[0] = mp->is[0] = '\0';
+ mp->begline = false;
+ mp->endline = false;
+}
+
+static void
+freemust (must *mp)
+{
freelist (mp->in);
+ free (mp->in);
+ free (mp->left);
+ free (mp->right);
+ free (mp->is);
+ free (mp);
}
static void
dfamust (struct dfa *d)
{
- must *musts;
- must *mp;
- char *result;
+ must *mp = NULL;
+ char const *result = "";
size_t ri;
size_t i;
- int exact;
- token t;
- static must must0;
+ bool exact = false;
+ bool begline = false;
+ bool endline = false;
struct dfamust *dm;
- static char empty_string[] = "";
-
- result = empty_string;
- exact = 0;
- MALLOC (musts, d->tindex + 1);
- mp = musts;
- for (i = 0; i <= d->tindex; ++i)
- mp[i] = must0;
- for (i = 0; i <= d->tindex; ++i)
- {
- mp[i].in = xmalloc (sizeof *mp[i].in);
- mp[i].left = xmalloc (2);
- mp[i].right = xmalloc (2);
- mp[i].is = xmalloc (2);
- mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
- mp[i].in[0] = NULL;
- }
-#ifdef DEBUG
- fprintf (stderr, "dfamust:\n");
- for (i = 0; i < d->tindex; ++i)
- {
- fprintf (stderr, " %zd:", i);
- prtok (d->tokens[i]);
- }
- putc ('\n', stderr);
-#endif
+
for (ri = 0; ri < d->tindex; ++ri)
{
- switch (t = d->tokens[ri])
+ token t = d->tokens[ri];
+ switch (t)
{
+ case BEGLINE:
+ mp = allocmust (mp);
+ mp->begline = true;
+ break;
+ case ENDLINE:
+ mp = allocmust (mp);
+ mp->endline = true;
+ break;
case LPAREN:
case RPAREN:
assert (!"neither LPAREN nor RPAREN may appear here");
+
case EMPTY:
- case BEGLINE:
- case ENDLINE:
case BEGWORD:
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
case BACKREF:
- resetmust (mp);
+ case ANYCHAR:
+ case MBCSET:
+ mp = allocmust (mp);
break;
+
case STAR:
case QMARK:
- assert (musts < mp);
- --mp;
resetmust (mp);
break;
+
case OR:
- assert (&musts[2] <= mp);
{
char **new;
- must *lmp;
- must *rmp;
+ must *rmp = mp;
+ must *lmp = mp = mp->prev;
size_t j, ln, rn, n;
- rmp = --mp;
- lmp = --mp;
/* Guaranteed to be. Unlikely, but ... */
if (!STREQ (lmp->is, rmp->is))
lmp->is[0] = '\0';
+ lmp->begline &= rmp->begline;
+ lmp->endline &= rmp->endline;
/* Left side--easy */
i = 0;
while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
@@ -4100,133 +4036,126 @@ dfamust (struct dfa *d)
lmp->right[j] = lmp->right[(ln - i) + j];
lmp->right[j] = '\0';
new = inboth (lmp->in, rmp->in);
- if (new == NULL)
- goto done;
freelist (lmp->in);
free (lmp->in);
lmp->in = new;
+ freemust (rmp);
}
break;
+
case PLUS:
- assert (musts < mp);
- --mp;
mp->is[0] = '\0';
break;
+
case END:
- assert (mp == &musts[1]);
- for (i = 0; musts[0].in[i] != NULL; ++i)
- if (strlen (musts[0].in[i]) > strlen (result))
- result = musts[0].in[i];
- if (STREQ (result, musts[0].is))
- exact = 1;
+ assert (!mp->prev);
+ for (i = 0; mp->in[i] != NULL; ++i)
+ if (strlen (mp->in[i]) > strlen (result))
+ result = mp->in[i];
+ if (STREQ (result, mp->is))
+ {
+ exact = true;
+ begline = mp->begline;
+ endline = mp->endline;
+ }
goto done;
+
case CAT:
- assert (&musts[2] <= mp);
{
- must *lmp;
- must *rmp;
+ must *rmp = mp;
+ must *lmp = mp = mp->prev;
- rmp = --mp;
- lmp = --mp;
/* In. Everything in left, plus everything in
right, plus concatenation of
left's right and right's left. */
lmp->in = addlists (lmp->in, rmp->in);
- if (lmp->in == NULL)
- goto done;
if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
{
- char *tp;
-
- tp = icpyalloc (lmp->right);
- tp = icatalloc (tp, rmp->left);
- lmp->in = enlist (lmp->in, tp, strlen (tp));
+ size_t lrlen = strlen (lmp->right);
+ size_t rllen = strlen (rmp->left);
+ char *tp = xmalloc (lrlen + rllen);
+ memcpy (tp, lmp->right, lrlen);
+ memcpy (tp + lrlen, rmp->left, rllen);
+ lmp->in = enlist (lmp->in, tp, lrlen + rllen);
free (tp);
- if (lmp->in == NULL)
- goto done;
}
/* Left-hand */
if (lmp->is[0] != '\0')
- {
- lmp->left = icatalloc (lmp->left, rmp->left);
- if (lmp->left == NULL)
- goto done;
- }
+ lmp->left = icatalloc (lmp->left, rmp->left);
/* Right-hand */
if (rmp->is[0] == '\0')
lmp->right[0] = '\0';
lmp->right = icatalloc (lmp->right, rmp->right);
- if (lmp->right == NULL)
- goto done;
/* Guaranteed to be */
- if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
+ if ((lmp->is[0] != '\0' || lmp->begline)
+ && (rmp->is[0] != '\0' || rmp->endline))
{
lmp->is = icatalloc (lmp->is, rmp->is);
- if (lmp->is == NULL)
- goto done;
+ lmp->endline = rmp->endline;
}
else
- lmp->is[0] = '\0';
+ {
+ lmp->is[0] = '\0';
+ lmp->begline = false;
+ lmp->endline = false;
+ }
+ freemust (rmp);
}
break;
+
+ case '\0':
+ /* Not on *my* shift. */
+ goto done;
+
default:
- if (t < END)
- {
- assert (!"oops! t >= END");
- }
- else if (t == '\0')
- {
- /* not on *my* shift */
- goto done;
- }
- else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
- {
- /* easy enough */
- resetmust (mp);
- }
- else
+ mp = allocmust (mp);
+ if (CSET <= t)
{
- /* plain character */
- resetmust (mp);
- mp->is[0] = mp->left[0] = mp->right[0] = t;
- mp->is[1] = mp->left[1] = mp->right[1] = '\0';
- mp->in = enlist (mp->in, mp->is, (size_t) 1);
- if (mp->in == NULL)
- goto done;
+ /* If T is a singleton, or if case-folding in a unibyte
+ locale and T's members all case-fold to the same char,
+ convert T to one of its members. Otherwise, do
+ nothing further with T. */
+ charclass *ccl = &d->charclasses[t - CSET];
+ int j;
+ for (j = 0; j < NOTCHAR; j++)
+ if (tstbit (j, *ccl))
+ break;
+ if (! (j < NOTCHAR))
+ break;
+ t = j;
+ while (++j < NOTCHAR)
+ if (tstbit (j, *ccl)
+ && ! (case_fold && !d->multibyte
+ && toupper (j) == toupper (t)))
+ break;
+ if (j < NOTCHAR)
+ break;
}
+ mp->is[0] = mp->left[0] = mp->right[0]
+ = case_fold && !d->multibyte ? toupper (t) : t;
+ mp->is[1] = mp->left[1] = mp->right[1] = '\0';
+ mp->in = enlist (mp->in, mp->is, 1);
break;
}
-#ifdef DEBUG
- fprintf (stderr, " node: %zd:", ri);
- prtok (d->tokens[ri]);
- fprintf (stderr, "\n in:");
- for (i = 0; mp->in[i]; ++i)
- fprintf (stderr, " \"%s\"", mp->in[i]);
- fprintf (stderr, "\n is: \"%s\"\n", mp->is);
- fprintf (stderr, " left: \"%s\"\n", mp->left);
- fprintf (stderr, " right: \"%s\"\n", mp->right);
-#endif
- ++mp;
}
done:
- if (strlen (result))
+ if (*result)
{
- MALLOC (dm, 1);
+ dm = xmalloc (sizeof *dm);
dm->exact = exact;
- dm->must = xmemdup (result, strlen (result) + 1);
+ dm->begline = begline;
+ dm->endline = endline;
+ dm->must = xstrdup (result);
dm->next = d->musts;
d->musts = dm;
}
- mp = musts;
- for (i = 0; i <= d->tindex; ++i)
+
+ while (mp)
{
- freelist (mp[i].in);
- free (mp[i].in);
- free (mp[i].left);
- free (mp[i].right);
- free (mp[i].is);
+ must *prev = mp->prev;
+ freemust (mp);
+ mp = prev;
}
- free (mp);
}
struct dfa *