aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c503
1 files changed, 225 insertions, 278 deletions
diff --git a/dfa.c b/dfa.c
index 228c9090..aecbbd76 100644
--- a/dfa.c
+++ b/dfa.c
@@ -53,8 +53,8 @@
#include "gettext.h"
#define _(str) gettext (str)
-#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
-#ifdef MBS_SUPPORT
+#include "mbsupport.h" /* defines MBS_SUPPORT to 1 or 0, as appropriate */
+#if MBS_SUPPORT
/* We can handle multibyte strings. */
#include <wchar.h>
#include <wctype.h>
@@ -68,7 +68,15 @@
#define bool int
#define true (1)
#define false (0)
-#endif
+#if ! MBS_SUPPORT
+#define wctype_t int
+#define wint_t int
+#define mbstate_t int
+#define WEOF EOF
+#define towupper toupper
+#define towlower tolower
+#endif /* ! MBS_SUPPORT */
+#endif /* GAWK */
#include "regex.h"
#include "dfa.h"
@@ -109,6 +117,11 @@ is_blank (int c)
/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
typedef int charclass[CHARCLASS_INTS];
+/* Convert a possibly-signed character to an unsigned character. This is
+ a bit safer than casting to unsigned char, since it catches some type
+ errors that the cast doesn't. */
+static inline unsigned char to_uchar (char ch) { return ch; }
+
/* Sometimes characters can only be matched depending on the surrounding
context. Such context decisions depend on what the previous character
was, and the value of the current (lookahead) character. Context
@@ -279,15 +292,12 @@ typedef struct
char backref; /* True if this state matches a \<digit>. */
unsigned char constraint; /* Constraint for this state to accept. */
int first_end; /* Token value of the first END in elems. */
-#if MBS_SUPPORT
position_set mbps; /* Positions which can match multibyte
characters. e.g. period.
These staff are used only if
MB_CUR_MAX > 1. */
-#endif
} dfa_state;
-#if MBS_SUPPORT
/* A bracket operator.
e.g. [a-c], [[:alpha:]], etc. */
struct mb_char_classes
@@ -306,7 +316,6 @@ struct mb_char_classes
char **coll_elems;
int ncoll_elems; /* Collating elements. */
};
-#endif
/* A compiled regular expression. */
struct dfa
@@ -419,19 +428,40 @@ struct dfa
static void dfamust (struct dfa *dfa);
static void regexp (void);
-#define CALLOC(p, t, n) ((p) = xcalloc((size_t)(n), sizeof (t)))
-#define MALLOC(p, t, n) ((p) = xmalloc((n) * sizeof (t)))
-#define REALLOC(p, t, n) ((p) = xrealloc((p), (n) * sizeof (t)))
-
-/* Reallocate an array of type t if nalloc is too small for index. */
-#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
- if ((index) >= (nalloc)) \
- { \
- do \
- (nalloc) *= 2; \
- while ((index) >= (nalloc)); \
- REALLOC(p, t, nalloc); \
- }
+/* These two macros are identical to the ones in gnulib's xalloc.h,
+ except that they not to case the result to "(t *)", and thus may
+ be used via type-free CALLOC and MALLOC macros. */
+#undef XNMALLOC
+#undef XCALLOC
+
+/* Allocate memory for N elements of type T, with error checking. */
+/* extern t *XNMALLOC (size_t n, typename t); */
+# define XNMALLOC(n, t) \
+ (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
+
+/* Allocate memory for N elements of type T, with error checking,
+ and zero it. */
+/* extern t *XCALLOC (size_t n, typename t); */
+# define XCALLOC(n, t) \
+ (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
+
+#define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
+#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
+#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
+
+/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
+#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
+ do \
+ { \
+ assert (0 <= (n_required)); \
+ if ((n_alloc) <= (n_required)) \
+ { \
+ size_t new_n_alloc = (n_required) + !(p); \
+ (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
+ (n_alloc) = new_n_alloc; \
+ } \
+ } \
+ while (false)
#ifdef DEBUG
@@ -464,10 +494,8 @@ prtok (token t)
case OR: s = "OR"; break;
case LPAREN: s = "LPAREN"; break;
case RPAREN: s = "RPAREN"; break;
-#if MBS_SUPPORT
case ANYCHAR: s = "ANYCHAR"; break;
case MBCSET: s = "MBCSET"; break;
-#endif /* MBS_SUPPORT */
default: s = "CSET"; break;
}
fprintf(stderr, "%s", s);
@@ -534,7 +562,7 @@ charclass_index (charclass const s)
for (i = 0; i < dfa->cindex; ++i)
if (equal(s, dfa->charclasses[i]))
return i;
- REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
+ REALLOC_IF_NECESSARY(dfa->charclasses, dfa->calloc, dfa->cindex + 1);
++dfa->cindex;
copyset(s, dfa->charclasses[i]);
return i;
@@ -587,7 +615,8 @@ setbit_c (int b, charclass c)
setbit (b, c);
}
#else
-#define setbit_c setbit
+# define setbit_c setbit
+static inline bool setbit_wc (wint_t wc, charclass c) { abort (); }
#endif
/* Like setbit_c, but if case is folded, set both cases of a letter. For
@@ -597,7 +626,6 @@ setbit_c (int b, charclass c)
static void
setbit_case_fold_c (int b, charclass c)
{
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
wint_t wc = btowc (b);
@@ -608,7 +636,6 @@ setbit_case_fold_c (int b, charclass c)
setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
}
else
-#endif
{
setbit (b, c);
if (case_fold && isalpha (b))
@@ -626,7 +653,7 @@ using_utf8 (void)
static int utf8 = -1;
if (utf8 == -1)
{
-#if defined HAVE_LANGINFO_CODESET && defined MBS_SUPPORT
+#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
#else
utf8 = 0;
@@ -651,7 +678,6 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
-#if MBS_SUPPORT
/* These variables are used only if (MB_CUR_MAX > 1). */
static mbstate_t mbs; /* Mbstate for mbrlen(). */
static wchar_t wctok; /* Wide character representation of the current
@@ -674,7 +700,6 @@ static wchar_t *inputwcs; /* Wide character representation of input
And inputwcs[i] is the codepoint. */
static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
static unsigned char const *buf_end; /* reference to end in dfaexec(). */
-#endif /* MBS_SUPPORT */
#if MBS_SUPPORT
@@ -696,7 +721,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
{ \
cur_mb_len = 1; \
--lexleft; \
- (wc) = (c) = (unsigned char) *lexptr++; \
+ (wc) = (c) = to_uchar (*lexptr++); \
} \
else \
{ \
@@ -725,7 +750,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
else \
return lasttok = END; \
} \
- (c) = (unsigned char) *lexptr++; \
+ (c) = to_uchar (*lexptr++); \
--lexleft; \
} while(0)
@@ -788,8 +813,8 @@ parse_bracket_exp (void)
Bit 3 = includes ranges, char/equiv classes or collation elements. */
int colon_warning_state;
-#if MBS_SUPPORT
- wint_t wc, wc1, wc2;
+ wint_t wc;
+ wint_t wc2;
/* Work area to build a mb_char_classes. */
struct mb_char_classes *work_mbc;
@@ -801,8 +826,7 @@ parse_bracket_exp (void)
ch_classes_al = equivs_al = coll_elems_al = 0;
if (MB_CUR_MAX > 1)
{
- REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
- dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+ REALLOC_IF_NECESSARY(dfa->mbcsets, dfa->mbcsets_alloc, dfa->nmbcsets + 1);
/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
We will update dfa->multibyte_prop[] in addtok(), because we can't
@@ -814,7 +838,6 @@ parse_bracket_exp (void)
}
else
work_mbc = NULL;
-#endif
memset (ccl, 0, sizeof ccl);
FETCH_WC (c, wc, _("unbalanced ["));
@@ -826,6 +849,7 @@ parse_bracket_exp (void)
else
invert = 0;
+ wint_t wc1 = 0;
colon_warning_state = (c == ':');
do
{
@@ -844,10 +868,8 @@ parse_bracket_exp (void)
/* If pattern contains `[[:', `[[.', or `[[='. */
if (c1 == ':'
-#if MBS_SUPPORT
/* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
|| (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
-#endif
)
{
size_t len = 0;
@@ -878,39 +900,36 @@ parse_bracket_exp (void)
if (!pred)
dfaerror(_("invalid character class"));
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1 && !pred->single_byte_only)
{
/* Store the character class as wctype_t. */
wctype_t wt = wctype (class);
if (ch_classes_al == 0)
- MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
- REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+ MALLOC(work_mbc->ch_classes, ++ch_classes_al);
+ REALLOC_IF_NECESSARY(work_mbc->ch_classes,
ch_classes_al,
work_mbc->nch_classes + 1);
work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
}
-#endif
for (c2 = 0; c2 < NOTCHAR; ++c2)
if (pred->func(c2))
setbit_case_fold_c (c2, ccl);
}
-#if MBS_SUPPORT
- else if (c1 == '=' || c1 == '.')
+ else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
{
char *elem;
- MALLOC(elem, char, len + 1);
+ MALLOC(elem, len + 1);
strncpy(elem, str, len + 1);
if (c1 == '=')
/* build equivalent class. */
{
if (equivs_al == 0)
- MALLOC(work_mbc->equivs, char*, ++equivs_al);
- REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
+ MALLOC(work_mbc->equivs, ++equivs_al);
+ REALLOC_IF_NECESSARY(work_mbc->equivs,
equivs_al,
work_mbc->nequivs + 1);
work_mbc->equivs[work_mbc->nequivs++] = elem;
@@ -920,14 +939,13 @@ parse_bracket_exp (void)
/* build collating element. */
{
if (coll_elems_al == 0)
- MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
- REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
+ MALLOC(work_mbc->coll_elems, ++coll_elems_al);
+ REALLOC_IF_NECESSARY(work_mbc->coll_elems,
coll_elems_al,
work_mbc->ncoll_elems + 1);
work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
}
}
-#endif
colon_warning_state |= 8;
/* Fetch new lookahead character. */
@@ -964,19 +982,18 @@ parse_bracket_exp (void)
&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
FETCH_WC(c2, wc2, _("unbalanced ["));
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
/* When case folding map a range, say [m-z] (or even [M-z])
to the pair of ranges, [m-z] [M-Z]. */
if (range_sts_al == 0)
{
- MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
- MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
+ MALLOC(work_mbc->range_sts, ++range_sts_al);
+ MALLOC(work_mbc->range_ends, ++range_ends_al);
}
- REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+ REALLOC_IF_NECESSARY(work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
- REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ REALLOC_IF_NECESSARY(work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
work_mbc->range_sts[work_mbc->nranges] =
case_fold ? towlower(wc) : (wchar_t)wc;
@@ -986,17 +1003,16 @@ parse_bracket_exp (void)
#ifndef GREP
if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
{
- REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+ REALLOC_IF_NECESSARY(work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
- REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ REALLOC_IF_NECESSARY(work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
}
#endif
}
else
-#endif
{
c1 = c;
if (case_fold)
@@ -1015,45 +1031,39 @@ parse_bracket_exp (void)
colon_warning_state |= (c == ':') ? 2 : 4;
-#if MBS_SUPPORT
- if (MB_CUR_MAX > 1)
+ if (MB_CUR_MAX == 1)
{
- if (case_fold && iswalpha(wc))
- {
- wc = towlower(wc);
- if (!setbit_wc (wc, ccl))
- {
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = wc;
- }
-#ifdef GREP
- continue;
-#else
- wc = towupper(wc);
-#endif
- }
+ setbit_case_fold_c (c, ccl);
+ continue;
+ }
+
+ if (case_fold && iswalpha(wc))
+ {
+ wc = towlower(wc);
if (!setbit_wc (wc, ccl))
{
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+ REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
work_mbc->nchars + 1);
work_mbc->chars[work_mbc->nchars++] = wc;
}
- }
- else
+#ifdef GREP
+ continue;
+#else
+ wc = towupper(wc);
#endif
- setbit_case_fold_c (c, ccl);
+ }
+ if (!setbit_wc (wc, ccl))
+ {
+ REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] = wc;
+ }
}
- while ((
-#if MBS_SUPPORT
- wc = wc1,
-#endif
- (c = c1) != ']'));
+ while ((wc = wc1, (c = c1) != ']'));
if (colon_warning_state == 7)
dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
static charclass zeroclass;
@@ -1061,13 +1071,10 @@ parse_bracket_exp (void)
work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
return MBCSET;
}
-#endif
if (invert)
{
-#if MBS_SUPPORT
assert(MB_CUR_MAX == 1);
-#endif
notset(ccl);
if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit(eolbyte, ccl);
@@ -1095,7 +1102,6 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
FETCH_WC (c, wctok, NULL);
@@ -1103,7 +1109,6 @@ lex (void)
goto normal_char;
}
else
-#endif /* MBS_SUPPORT */
FETCH(c, NULL);
switch (c)
@@ -1326,7 +1331,6 @@ lex (void)
case '.':
if (backslash)
goto normal_char;
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
/* In multibyte environment period must match with a single
@@ -1334,7 +1338,6 @@ lex (void)
laststart = 0;
return lasttok = ANYCHAR;
}
-#endif /* MBS_SUPPORT */
zeroset(ccl);
notset(ccl);
if (!(syntax_bits & RE_DOT_NEWLINE))
@@ -1379,12 +1382,10 @@ lex (void)
default:
normal_char:
laststart = 0;
-#if MBS_SUPPORT
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
if (MB_CUR_MAX > 1)
return lasttok = WCHAR;
-#endif
if (case_fold && isalpha(c))
{
@@ -1415,18 +1416,14 @@ static int depth; /* Current depth of a hypothetical stack
static void
addtok_mb (token t, int mbprop)
{
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
- REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
- dfa->tindex);
+ REALLOC_IF_NECESSARY(dfa->multibyte_prop, dfa->nmultibyte_prop,
+ dfa->tindex + 1);
dfa->multibyte_prop[dfa->tindex] = mbprop;
}
-#else
- (void) mbprop;
-#endif
- REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
+ REALLOC_IF_NECESSARY(dfa->tokens, dfa->talloc, dfa->tindex + 1);
dfa->tokens[dfa->tindex++] = t;
switch (t)
@@ -1451,16 +1448,13 @@ addtok_mb (token t, int mbprop)
dfa->depth = depth;
}
-#if MBS_SUPPORT
static void addtok_wc (wint_t wc);
-#endif
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
addtok (token t)
{
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1 && t == MBCSET)
{
bool need_or = false;
@@ -1508,8 +1502,9 @@ addtok (token t)
}
}
else
-#endif
- addtok_mb (t, 3);
+ {
+ addtok_mb (t, 3);
+ }
}
#if MBS_SUPPORT
@@ -1541,10 +1536,14 @@ addtok_wc (wint_t wc)
addtok(CAT);
}
}
+#else
+static void addtok_wc (wint_t wc) {}
+#endif
static void
add_utf8_anychar (void)
{
+#if MBS_SUPPORT
static const charclass utf8_classes[5] = {
{ 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
{ ~0, ~0, ~0, ~0, 0, 0, 0, 0 }, /* 00-7f: 1-byte sequence */
@@ -1560,7 +1559,7 @@ add_utf8_anychar (void)
for (i = 0; i < n; i++)
{
charclass c;
- memcpy (c, utf8_classes[i], sizeof c);
+ copyset (utf8_classes[i], c);
if (i == 1)
{
if (!(syntax_bits & RE_DOT_NEWLINE))
@@ -1589,8 +1588,8 @@ add_utf8_anychar (void)
addtok (CAT);
addtok (OR);
}
-}
#endif
+}
/* The grammar understood by the parser is as follows.
@@ -1634,8 +1633,7 @@ atom (void)
{
/* empty */
}
-#if MBS_SUPPORT
- else if (tok == WCHAR)
+ else if (MBS_SUPPORT && tok == WCHAR)
{
addtok_wc (case_fold ? towlower(wctok) : wctok);
#ifndef GREP
@@ -1648,8 +1646,7 @@ atom (void)
tok = lex();
}
-
- else if (tok == ANYCHAR && using_utf8())
+ else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8())
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1661,8 +1658,6 @@ atom (void)
add_utf8_anychar();
tok = lex();
}
-#endif /* MBS_SUPPORT */
-
else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
#if MBS_SUPPORT
@@ -1715,11 +1710,9 @@ copytoks (int tindex, int ntokens)
for (i = 0; i < ntokens; ++i)
{
addtok(dfa->tokens[tindex + i]);
-#if MBS_SUPPORT
/* Update index into multibyte csets. */
if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
-#endif
}
}
@@ -1799,13 +1792,11 @@ dfaparse (char const *s, size_t len, struct dfa *d)
lasttok = END;
laststart = 1;
parens = 0;
-#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
cur_mb_len = 0;
memset(&mbs, 0, sizeof mbs);
}
-#endif /* MBS_SUPPORT */
if (! syntax_bits_set)
dfaerror(_("no syntax specified"));
@@ -1833,16 +1824,13 @@ dfaparse (char const *s, size_t len, struct dfa *d)
static void
copy (position_set const *src, position_set *dst)
{
- int i;
-
- for (i = 0; i < src->nelem; ++i)
- dst->elems[i] = src->elems[i];
+ memcpy(dst->elems, src->elems, sizeof(dst->elems[0]) * src->nelem);
dst->nelem = src->nelem;
}
-/* Insert a position in a set. Position sets are maintained in sorted
- order according to index. If position already exists in the set with
- the same index then their constraints are logically or'd together.
+/* Insert position P in set S. S is maintained in sorted order on
+ decreasing index. If there is already an entry in S with P.index
+ then merge (logically-OR) P's constraints into the one in S.
S->elems must point to an array large enough to hold the resulting set. */
static void
insert (position p, position_set *s)
@@ -1852,7 +1840,7 @@ insert (position p, position_set *s)
while (lo < hi)
{
int mid = ((unsigned) lo + (unsigned) hi) >> 1;
- if (s->elems[mid].index < p.index)
+ if (s->elems[mid].index > p.index)
lo = mid + 1;
else
hi = mid;
@@ -1941,19 +1929,20 @@ state_index (struct dfa *d, position_set const *s, int newline, int letter)
}
/* We'll have to create a new state. */
- REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
+ REALLOC_IF_NECESSARY(d->states, d->salloc, d->sindex + 1);
d->states[i].hash = hash;
- MALLOC(d->states[i].elems.elems, position, s->nelem);
+ MALLOC(d->states[i].elems.elems, s->nelem);
copy(s, &d->states[i].elems);
d->states[i].newline = newline;
d->states[i].letter = letter;
d->states[i].backref = 0;
d->states[i].constraint = 0;
d->states[i].first_end = 0;
-#if MBS_SUPPORT
- d->states[i].mbps.nelem = 0;
- d->states[i].mbps.elems = NULL;
-#endif
+ if (MBS_SUPPORT)
+ {
+ d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.elems = NULL;
+ }
for (j = 0; j < s->nelem; ++j)
if (d->tokens[s->elems[j].index] < 0)
{
@@ -1989,7 +1978,7 @@ epsclosure (position_set *s, struct dfa const *d)
char *visited; /* array of booleans, enough to use char, not int */
position p, old;
- CALLOC(visited, char, d->tindex);
+ CALLOC(visited, d->tindex);
for (i = 0; i < s->nelem; ++i)
if (d->tokens[s->elems[i].index] >= NOTCHAR
@@ -2126,20 +2115,20 @@ dfaanalyze (struct dfa *d, int searchflag)
d->searchflag = searchflag;
- MALLOC(nullable, int, d->depth);
+ MALLOC(nullable, d->depth);
o_nullable = nullable;
- MALLOC(nfirstpos, int, d->depth);
+ MALLOC(nfirstpos, d->depth);
o_nfirst = nfirstpos;
- MALLOC(firstpos, position, d->nleaves);
+ MALLOC(firstpos, d->nleaves);
o_firstpos = firstpos, firstpos += d->nleaves;
- MALLOC(nlastpos, int, d->depth);
+ MALLOC(nlastpos, d->depth);
o_nlast = nlastpos;
- MALLOC(lastpos, position, d->nleaves);
+ MALLOC(lastpos, d->nleaves);
o_lastpos = lastpos, lastpos += d->nleaves;
- CALLOC(nalloc, int, d->tindex);
- MALLOC(merged.elems, position, 2 * d->nleaves);
+ CALLOC(nalloc, d->tindex);
+ MALLOC(merged.elems, d->nleaves);
- CALLOC(d->follows, position_set, d->tindex);
+ CALLOC(d->follows, d->tindex);
for (i = 0; i < d->tindex; ++i)
#ifdef DEBUG
@@ -2165,8 +2154,8 @@ dfaanalyze (struct dfa *d, int searchflag)
for (j = 0; j < nlastpos[-1]; ++j)
{
merge(&tmp, &d->follows[pos[j].index], &merged);
- REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
- nalloc[pos[j].index], merged.nelem - 1);
+ REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems,
+ nalloc[pos[j].index], merged.nelem);
copy(&merged, &d->follows[pos[j].index]);
}
@@ -2185,8 +2174,8 @@ dfaanalyze (struct dfa *d, int searchflag)
for (j = 0; j < nlastpos[-2]; ++j)
{
merge(&tmp, &d->follows[pos[j].index], &merged);
- REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
- nalloc[pos[j].index], merged.nelem - 1);
+ REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems,
+ nalloc[pos[j].index], merged.nelem);
copy(&merged, &d->follows[pos[j].index]);
}
@@ -2247,7 +2236,7 @@ dfaanalyze (struct dfa *d, int searchflag)
/* Allocate the follow set for this position. */
nalloc[i] = 1;
- MALLOC(d->follows[i].elems, position, nalloc[i]);
+ MALLOC(d->follows[i].elems, nalloc[i]);
break;
}
#ifdef DEBUG
@@ -2296,7 +2285,7 @@ dfaanalyze (struct dfa *d, int searchflag)
copy(&d->follows[i], &merged);
epsclosure(&merged, d);
if (d->follows[i].nelem < merged.nelem)
- REALLOC(d->follows[i].elems, position, merged.nelem);
+ REALLOC(d->follows[i].elems, merged.nelem);
copy(&merged, &d->follows[i]);
}
@@ -2316,7 +2305,7 @@ dfaanalyze (struct dfa *d, int searchflag)
/* Build the initial state. */
d->salloc = 1;
d->sindex = 0;
- MALLOC(d->states, dfa_state, d->salloc);
+ MALLOC(d->states, d->salloc);
state_index(d, &merged, wants_newline, 0);
free(o_nullable);
@@ -2381,9 +2370,7 @@ dfastate (int s, struct dfa *d, int trans[])
int wants_letter; /* New state wants to know letter context. */
int state_letter; /* New state on a letter transition. */
static int initialized; /* Flag for static initialization. */
-#if MBS_SUPPORT
int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
-#endif
int i, j, k;
grps = xnmalloc (NOTCHAR, sizeof *grps);
@@ -2408,23 +2395,21 @@ dfastate (int s, struct dfa *d, int trans[])
setbit(d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
-#if MBS_SUPPORT
- else if (d->tokens[pos.index] == ANYCHAR
- || d->tokens[pos.index] == MBCSET)
- /* MB_CUR_MAX > 1 */
+ else if (MBS_SUPPORT
+ && (d->tokens[pos.index] == ANYCHAR
+ || d->tokens[pos.index] == MBCSET))
+ /* MB_CUR_MAX > 1 */
{
/* ANYCHAR and MBCSET must match with a single character, so we
must put it to d->states[s].mbps, which contains the positions
which can match with a single character not a byte. */
if (d->states[s].mbps.nelem == 0)
{
- MALLOC(d->states[s].mbps.elems, position,
- d->states[s].elems.nelem);
+ MALLOC(d->states[s].mbps.elems, d->states[s].elems.nelem);
}
insert(pos, &(d->states[s].mbps));
continue;
}
-#endif /* MBS_SUPPORT */
else
continue;
@@ -2488,7 +2473,7 @@ dfastate (int s, struct dfa *d, int trans[])
{
copyset(leftovers, labels[ngrps]);
copyset(intersect, labels[j]);
- MALLOC(grps[ngrps].elems, position, d->nleaves);
+ MALLOC(grps[ngrps].elems, d->nleaves);
copy(&grps[j], &grps[ngrps]);
++ngrps;
}
@@ -2509,15 +2494,15 @@ dfastate (int s, struct dfa *d, int trans[])
{
copyset(matches, labels[ngrps]);
zeroset(matches);
- MALLOC(grps[ngrps].elems, position, d->nleaves);
+ MALLOC(grps[ngrps].elems, d->nleaves);
grps[ngrps].nelem = 1;
grps[ngrps].elems[0] = pos;
++ngrps;
}
}
- MALLOC(follows.elems, position, d->nleaves);
- MALLOC(tmp.elems, position, d->nleaves);
+ MALLOC(follows.elems, d->nleaves);
+ MALLOC(tmp.elems, d->nleaves);
/* If we are a searching matcher, the default transition is to a state
containing the positions of state 0, otherwise the default transition
@@ -2561,7 +2546,6 @@ dfastate (int s, struct dfa *d, int trans[])
for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
/* If a token in follows.elems is not 1st byte of a multibyte
@@ -2592,15 +2576,12 @@ dfastate (int s, struct dfa *d, int trans[])
}
}
}
-#endif
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
-#if MBS_SUPPORT
- if (d->searchflag && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
-#else
- if (d->searchflag)
-#endif
+ if (d->searchflag
+ && (! MBS_SUPPORT
+ || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
for (j = 0; j < d->states[0].elems.nelem; ++j)
insert(d->states[0].elems.elems[j], &follows);
@@ -2697,7 +2678,7 @@ build_state (int s, struct dfa *d)
s, *d))
d->success[s] |= 1;
- MALLOC(trans, int, NOTCHAR);
+ MALLOC(trans, NOTCHAR);
dfastate(s, d, trans);
/* Now go through the new transition table, and make sure that the trans
@@ -2710,11 +2691,11 @@ build_state (int s, struct dfa *d)
while (trans[i] >= d->tralloc)
d->tralloc *= 2;
- REALLOC(d->realtrans, int *, d->tralloc + 1);
+ REALLOC(d->realtrans, d->tralloc + 1);
d->trans = d->realtrans + 1;
- REALLOC(d->fails, int *, d->tralloc);
- REALLOC(d->success, int, d->tralloc);
- REALLOC(d->newlines, int, d->tralloc);
+ REALLOC(d->fails, d->tralloc);
+ REALLOC(d->success, d->tralloc);
+ REALLOC(d->newlines, d->tralloc);
while (oldalloc < d->tralloc)
{
d->trans[oldalloc] = NULL;
@@ -2738,15 +2719,14 @@ build_state_zero (struct dfa *d)
{
d->tralloc = 1;
d->trcount = 0;
- CALLOC(d->realtrans, int *, d->tralloc + 1);
+ CALLOC(d->realtrans, d->tralloc + 1);
d->trans = d->realtrans + 1;
- CALLOC(d->fails, int *, d->tralloc);
- MALLOC(d->success, int, d->tralloc);
- MALLOC(d->newlines, int, d->tralloc);
+ CALLOC(d->fails, d->tralloc);
+ MALLOC(d->success, d->tralloc);
+ MALLOC(d->newlines, d->tralloc);
build_state(0, d);
}
-#if MBS_SUPPORT
/* Multibyte character handling sub-routines for dfaexec. */
/* Initial state may encounter the byte which is not a single byte character
@@ -2783,11 +2763,11 @@ realloc_trans_if_necessary(struct dfa *d, int new_state)
while (new_state >= d->tralloc)
d->tralloc *= 2;
- REALLOC(d->realtrans, int *, d->tralloc + 1);
+ REALLOC(d->realtrans, d->tralloc + 1);
d->trans = d->realtrans + 1;
- REALLOC(d->fails, int *, d->tralloc);
- REALLOC(d->success, int, d->tralloc);
- REALLOC(d->newlines, int, d->tralloc);
+ REALLOC(d->fails, d->tralloc);
+ REALLOC(d->success, d->tralloc);
+ REALLOC(d->newlines, d->tralloc);
while (oldalloc < d->tralloc)
{
d->trans[oldalloc] = NULL;
@@ -2850,11 +2830,9 @@ transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
return rval;
}
-/* Check whether period can match or not in the current context. If it can,
- return the amount of the bytes with which period can match, otherwise
- return 0.
- `pos' is the position of the period. `idx' is the index from the
- buf_begin, and it is the current position in the buffer. */
+/* Match a "." against the current context. buf_begin[IDX] is the
+ current position. Return the length of the match, in bytes.
+ POS is the position of the ".". */
static int
match_anychar (struct dfa *d, int s, position pos, int idx)
{
@@ -2890,11 +2868,10 @@ match_anychar (struct dfa *d, int s, position pos, int idx)
return mbclen;
}
-/* Check whether bracket expression can match or not in the current context.
- If it can, return the amount of the bytes with which expression can match,
- otherwise return 0.
- `pos' is the position of the bracket expression. `idx' is the index
- from the buf_begin, and it is the current position in the buffer. */
+/* Match a bracket expression against the current context.
+ buf_begin[IDX] is the current position.
+ Return the length of the match, in bytes.
+ POS is the position of the bracket expression. */
static int
match_mb_charset (struct dfa *d, int s, position pos, int idx)
{
@@ -3021,7 +2998,7 @@ check_matching_with_multibyte_ops (struct dfa *d, int s, int idx)
int i;
int* rarray;
- MALLOC(rarray, int, d->states[s].mbps.nelem);
+ MALLOC(rarray, d->states[s].mbps.nelem);
for (i = 0; i < d->states[s].mbps.nelem; ++i)
{
position pos = d->states[s].mbps.elems[i];
@@ -3142,7 +3119,7 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
/* This state has some operators which can match a multibyte character. */
follows.nelem = 0;
- MALLOC(follows.elems, position, d->nleaves);
+ MALLOC(follows.elems, d->nleaves);
/* `maxlen' may be longer than the length of a character, because it may
not be a character but a (multi character) collating element.
@@ -3177,11 +3154,13 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
return s1;
}
+
/* Initialize mblen_buf and inputwcs with data from the next line. */
static void
prepare_wc_buf (const char *begin, const char *end)
{
+#if MBS_SUPPORT
unsigned char eol = eolbyte;
size_t remain_bytes, i;
@@ -3222,9 +3201,8 @@ prepare_wc_buf (const char *begin, const char *end)
buf_end = (unsigned char *) (begin + i);
mblen_buf[i] = 0;
inputwcs[i] = 0; /* sentinel */
-}
-
#endif /* MBS_SUPPORT */
+}
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
@@ -3242,7 +3220,7 @@ char *
dfaexec (struct dfa *d, char const *begin, char *end,
int newline, int *count, int *backref)
{
- int s, s1, tmp; /* Current state. */
+ int s, s1; /* Current state. */
unsigned char const *p; /* Current input character. */
int **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
@@ -3270,19 +3248,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
saved_end = *(unsigned char *) end;
*end = eol;
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
- MALLOC(mblen_buf, unsigned char, end - begin + 2);
- MALLOC(inputwcs, wchar_t, end - begin + 2);
+ MALLOC(mblen_buf, end - begin + 2);
+ MALLOC(inputwcs, end - begin + 2);
memset(&mbs, 0, sizeof(mbstate_t));
prepare_wc_buf ((const char *) p, end);
}
-#endif /* MBS_SUPPORT */
for (;;)
{
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
while ((t = trans[s]))
{
@@ -3316,15 +3291,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
trans = d->trans;
}
else
-#endif /* MBS_SUPPORT */
- while ((t = trans[s]) != 0) { /* hand-optimized loop */
- s1 = t[*p++];
- if ((t = trans[s1]) == 0) {
- tmp = s ; s = s1 ; s1 = tmp ; /* swap */
- break;
+ {
+ while ((t = trans[s]) != 0)
+ {
+ s1 = t[*p++];
+ if ((t = trans[s1]) == 0)
+ {
+ int tmp = s; s = s1; s1 = tmp; /* swap */
+ break;
+ }
+ s = t[*p++];
+ }
}
- s = t[*p++];
- }
if (s >= 0 && (char *) p <= end && d->fails[s])
{
@@ -3332,19 +3310,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
{
if (backref)
*backref = (d->states[s].backref != 0);
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
free(mblen_buf);
free(inputwcs);
}
-#endif /* MBS_SUPPORT */
*end = saved_end;
return (char *) p;
}
s1 = s;
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
/* Can match with a multibyte character (and multicharacter
@@ -3353,8 +3328,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
trans = d->trans;
}
else
-#endif /* MBS_SUPPORT */
- s = d->fails[s][*p++];
+ s = d->fails[s][*p++];
continue;
}
@@ -3364,22 +3338,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (count)
++*count;
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
prepare_wc_buf ((const char *) p, end);
-#endif
}
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
{
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
free(mblen_buf);
free(inputwcs);
}
-#endif /* MBS_SUPPORT */
*end = saved_end;
return NULL;
}
@@ -3401,7 +3371,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
}
-#if MBS_SUPPORT
static void
free_mbdata (struct dfa *d)
{
@@ -3432,7 +3401,6 @@ free_mbdata (struct dfa *d)
d->mbcsets = NULL;
d->nmbcsets = 0;
}
-#endif
/* Initialize the components of a dfa that the other routines don't
initialize for themselves. */
@@ -3442,31 +3410,29 @@ dfainit (struct dfa *d)
memset (d, 0, sizeof *d);
d->calloc = 1;
- MALLOC(d->charclasses, charclass, d->calloc);
+ MALLOC(d->charclasses, d->calloc);
d->talloc = 1;
- MALLOC(d->tokens, token, d->talloc);
+ MALLOC(d->tokens, d->talloc);
-#if MBS_SUPPORT
d->mb_cur_max = MB_CUR_MAX;
+
if (d->mb_cur_max > 1)
{
d->nmultibyte_prop = 1;
- MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
+ MALLOC(d->multibyte_prop, d->nmultibyte_prop);
d->mbcsets_alloc = 1;
- MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
+ MALLOC(d->mbcsets, d->mbcsets_alloc);
}
-#endif
}
-#if MBS_SUPPORT
static void
dfaoptimize (struct dfa *d)
{
- unsigned int i;
- if (!using_utf8())
+ if (!MBS_SUPPORT || !using_utf8())
return;
+ unsigned int i;
for (i = 0; i < d->tindex; ++i)
{
switch(d->tokens[i])
@@ -3485,7 +3451,6 @@ dfaoptimize (struct dfa *d)
free_mbdata (d);
d->mb_cur_max = 1;
}
-#endif
/* Parse and analyze a single string of the given length. */
void
@@ -3494,9 +3459,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfainit(d);
dfaparse(s, len, d);
dfamust(d);
-#if MBS_SUPPORT
dfaoptimize(d);
-#endif
dfaanalyze(d, searchflag);
}
@@ -3510,16 +3473,13 @@ dfafree (struct dfa *d)
free(d->charclasses);
free(d->tokens);
-#if MBS_SUPPORT
if (d->mb_cur_max > 1)
free_mbdata(d);
-#endif /* MBS_SUPPORT */
for (i = 0; i < d->sindex; ++i) {
free(d->states[i].elems.elems);
-#if MBS_SUPPORT
- free(d->states[i].mbps.elems);
-#endif /* MBS_SUPPORT */
+ if (MBS_SUPPORT)
+ free(d->states[i].mbps.elems);
}
free(d->states);
for (i = 0; i < d->tindex; ++i)
@@ -3627,23 +3587,15 @@ dfafree (struct dfa *d)
'psi|epsilon' is likelier)? */
static char *
-icatalloc (char const *old, char const *new)
+icatalloc (char *old, char const *new)
{
char *result;
- size_t oldsize, newsize;
-
- newsize = (new == NULL) ? 0 : strlen(new);
- if (old == NULL)
- oldsize = 0;
- else if (newsize == 0)
- return (char *) old;
- else oldsize = strlen(old);
- if (old == NULL)
- result = malloc(newsize + 1);
- else
- result = realloc((void *) old, oldsize + newsize + 1);
- if (result != NULL && new != NULL)
- (void) strcpy(result + oldsize, new);
+ size_t oldsize = old == NULL ? 0 : strlen (old);
+ size_t newsize = new == NULL ? 0 : strlen (new);
+ if (newsize == 0)
+ return old;
+ result = xrealloc (old, oldsize + newsize + 1);
+ strcpy (result + oldsize, new);
return result;
}
@@ -3714,9 +3666,7 @@ enlist (char **cpp, char *new, size_t len)
cpp[i] = NULL;
}
/* Add the new string. */
- cpp = realloc((char *) cpp, (i + 2) * sizeof *cpp);
- if (cpp == NULL)
- return NULL;
+ cpp = xnrealloc(cpp, i + 2, sizeof *cpp);
cpp[i] = new;
cpp[i + 1] = NULL;
return cpp;
@@ -3753,8 +3703,16 @@ comsubs (char *left, char const *right)
}
if (len == 0)
continue;
- if ((cpp = enlist(cpp, lcp, len)) == NULL)
- break;
+ {
+ char **p = enlist (cpp, lcp, len);
+ if (p == NULL)
+ {
+ freelist (cpp);
+ cpp = NULL;
+ break;
+ }
+ cpp = p;
+ }
}
return cpp;
}
@@ -3841,21 +3799,16 @@ dfamust (struct dfa *d)
result = empty_string;
exact = 0;
- musts = malloc((d->tindex + 1) * sizeof *musts);
- if (musts == NULL)
- return;
+ musts = xnmalloc(d->tindex + 1, sizeof *musts);
mp = musts;
for (i = 0; i <= d->tindex; ++i)
mp[i] = must0;
for (i = 0; i <= d->tindex; ++i)
{
- mp[i].in = malloc(sizeof *mp[i].in);
- mp[i].left = malloc(2);
- mp[i].right = malloc(2);
- mp[i].is = malloc(2);
- if (mp[i].in == NULL || mp[i].left == NULL ||
- mp[i].right == NULL || mp[i].is == NULL)
- goto done;
+ mp[i].in = xmalloc(sizeof *mp[i].in);
+ mp[i].left = xmalloc(2);
+ mp[i].right = xmalloc(2);
+ mp[i].is = xmalloc(2);
mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
mp[i].in[0] = NULL;
}
@@ -3962,13 +3915,8 @@ dfamust (struct dfa *d)
char *tp;
tp = icpyalloc(lmp->right);
- if (tp == NULL)
- goto done;
tp = icatalloc(tp, rmp->left);
- if (tp == NULL)
- goto done;
- lmp->in = enlist(lmp->in, tp,
- strlen(tp));
+ lmp->in = enlist(lmp->in, tp, strlen(tp));
free(tp);
if (lmp->in == NULL)
goto done;
@@ -4009,10 +3957,9 @@ dfamust (struct dfa *d)
goto done;
}
else if (t >= CSET
-#if MBS_SUPPORT
+ || !MBS_SUPPORT
|| t == ANYCHAR
|| t == MBCSET
-#endif /* MBS_SUPPORT */
)
{
/* easy enough */
@@ -4045,9 +3992,9 @@ dfamust (struct dfa *d)
done:
if (strlen(result))
{
- MALLOC(dm, struct dfamust, 1);
+ MALLOC(dm, 1);
dm->exact = exact;
- MALLOC(dm->must, char, strlen(result) + 1);
+ MALLOC(dm->must, strlen(result) + 1);
strcpy(dm->must, result);
dm->next = d->musts;
d->musts = dm;