aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog4
-rw-r--r--dfa.c513
2 files changed, 228 insertions, 289 deletions
diff --git a/ChangeLog b/ChangeLog
index 064e375c..2d88b917 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -5,6 +5,10 @@
* debug.c (serialize_list): Renamed from `serialize'.
(unserialize_list): Renamed from `unserialize', for consistency.
+ Unrelated:
+
+ * dfa.c: Sync with GNULIB.
+
2016-11-21 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c: Sync with GNULIB.
diff --git a/dfa.c b/dfa.c
index 02673781..f3939189 100644
--- a/dfa.c
+++ b/dfa.c
@@ -340,7 +340,8 @@ typedef struct
ANYCHAR. */
} dfa_state;
-/* Maximum for any transition table count that exceeds min_trcount. */
+/* Maximum for any transition table count. This should be at least 3,
+ for the initial state setup. */
enum { MAX_TRCOUNT = 1024 };
/* A bracket operator.
@@ -503,22 +504,26 @@ struct dfa
/* Fields filled by dfaexec. */
state_num tralloc; /* Number of transition tables that have
- slots so far, not counting trans[-1]. */
+ slots so far, not counting trans[-1] and
+ trans[-2]. */
int trcount; /* Number of transition tables that have
- actually been built. */
- int min_trcount; /* Minimum of number of transition tables.
- Always keep the number, even after freeing
- the transition tables. It is also the
- number of initial states. */
+ been built, other than for initial
+ states. */
+ int min_trcount; /* Number of initial states. Equivalently,
+ the minimum state number for which trcount
+ counts transitions. */
state_num **trans; /* Transition tables for states that can
never accept. If the transitions for a
state have not yet been computed, or the
state could possibly accept, its entry in
this table is NULL. This points to one
past the start of the allocated array,
- and trans[-1] is always NULL. */
+ and trans[-1] and trans[-2] are always
+ NULL. */
state_num **fails; /* Transition tables after failing to accept
- on a state that potentially could do so. */
+ on a state that potentially could do so.
+ If trans[i] is non-null, fails[i] must
+ be null. */
int *success; /* Table of acceptance conditions used in
dfaexec and computed in build_state. */
state_num *newlines; /* Transitions on newlines. The entry for a
@@ -533,7 +538,8 @@ struct dfa
do not distinguish between their contexts,
as not supported word. */
position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
- state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
+ state_num **mb_trans; /* Transition tables for states with
+ ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
@@ -715,10 +721,17 @@ zeroset (charclass s)
}
static void
-notset (charclass s)
+fillset (charclass s)
{
int i;
+ for (i = 0; i < CHARCLASS_WORDS; i++)
+ s[i] = CHARCLASS_WORD_MASK;
+}
+static void
+notset (charclass s)
+{
+ int i;
for (i = 0; i < CHARCLASS_WORDS; ++i)
s[i] = CHARCLASS_WORD_MASK & ~s[i];
}
@@ -1429,8 +1442,7 @@ lex (struct dfa *dfa)
goto normal_char;
if (dfa->canychar == (size_t) -1)
{
- zeroset (ccl);
- notset (ccl);
+ fillset (ccl);
if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
clrbit ('\n', ccl);
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
@@ -2498,6 +2510,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
if (separate_contexts & CTX_LETTER)
d->min_trcount = state_index (d, &merged, CTX_LETTER);
d->min_trcount++;
+ d->trcount = 0;
free (posalloc);
free (stkalloc);
@@ -2506,75 +2519,80 @@ dfaanalyze (struct dfa *d, bool searchflag)
}
-/* Find, for each character, the transition out of state s of d, and store
- it in the appropriate slot of trans.
+/* Return the transition out of state s of d for the input character uc,
+ updating the slots in trans accordingly.
- We divide the positions of s into groups (positions can appear in more
- than one group). Each group is labeled with a set of characters that
+ Do not worry about all possible input characters; calculate just the group
+ of positions that match uc. Label it with the set of characters that
every position in the group matches (taking into account, if necessary,
- preceding context information of s). For each group, find the union
- of the its elements' follows. This set is the set of positions of the
+ preceding context information of s). Then find the union
+ of these positions' follows, i.e., the set of positions of the
new state. For each character in the group's label, set the transition
on this character to be to a state corresponding to the set's positions,
and its associated backward context information, if necessary.
- If we are building a searching matcher, we include the positions of state
+ When building a searching matcher, include the positions of state
0 in every state.
- The collection of groups is constructed by building an equivalence-class
+ The group is constructed by building an equivalence-class
partition of the positions of s.
For each position, find the set of characters C that it matches. Eliminate
any characters from C that fail on grounds of backward context.
- Search through the groups, looking for a group whose label L has nonempty
+ Check whether the group's label L has nonempty
intersection with C. If L - C is nonempty, create a new group labeled
L - C and having the same positions as the current group, and set L to
- the intersection of L and C. Insert the position in this group, set
+ the intersection of L and C. Insert the position in the group, set
C = C - L, and resume scanning.
If after comparing with every group there are characters remaining in C,
create a new group labeled with the characters of C and insert this
position in that group. */
-static void
-dfastate (state_num s, struct dfa *d, state_num trans[])
+static state_num
+dfastate (state_num s, struct dfa *d, unsigned char uc, state_num trans[])
{
- leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */
- charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
- size_t ngrps = 0; /* Number of groups actually used. */
- position pos; /* Current position being considered. */
- charclass matches; /* Set of matching characters. */
- charclass_word matchesf; /* Nonzero if matches is nonempty. */
- charclass intersect; /* Intersection with some label set. */
- charclass_word intersectf; /* Nonzero if intersect is nonempty. */
- charclass leftovers; /* Stuff in the label that didn't match. */
- charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */
- position_set follows; /* Union of the follows of some group. */
+ leaf_set group; /* Positions that match the input char. */
+ charclass label; /* The group's label. */
+ position_set follows; /* Union of the follows of the group. */
position_set tmp; /* Temporary space for merging sets. */
- int possible_contexts; /* Contexts that this group can match. */
- int separate_contexts; /* Context that new state wants to know. */
state_num state; /* New state. */
state_num state_newline; /* New state on a newline transition. */
state_num state_letter; /* New state on a letter transition. */
- bool next_isnt_1st_byte = false; /* We can't add state0. */
size_t i, j, k;
#ifdef DEBUG
fprintf (stderr, "build state %td\n", s);
#endif
- zeroset (matches);
+ group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
+ group.nelem = 0;
+
+ fillset (label);
for (i = 0; i < d->states[s].elems.nelem; ++i)
{
- pos = d->states[s].elems.elems[i];
+ charclass matches; /* Set of matching characters. */
+ position pos = d->states[s].elems.elems[i];
+ bool matched = false;
if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
- setbit (d->tokens[pos.index], matches);
+ {
+ zeroset (matches);
+ setbit (d->tokens[pos.index], matches);
+ if (d->tokens[pos.index] == uc)
+ matched = true;
+ }
else if (d->tokens[pos.index] >= CSET)
- copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->tokens[pos.index] == ANYCHAR)
{
+ copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
+ if (tstbit (uc, d->charclasses[d->tokens[pos.index] - CSET]))
+ matched = true;
+ }
+ else if (d->tokens[pos.index] == ANYCHAR)
+ {
copyset (d->charclasses[d->canychar], matches);
+ if (tstbit (uc, d->charclasses[d->canychar]))
+ matched = true;
/* ANYCHAR must match with a single character, so we must put
it to D->states[s].mbps which contains the positions which
@@ -2629,155 +2647,69 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
fprintf (stderr, "\n");
#endif
- for (j = 0; j < ngrps; ++j)
+ if (matched)
{
- /* If matches contains a single character only, and the current
- group's label doesn't contain that character, go on to the
- next group. */
- if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
- && !tstbit (d->tokens[pos.index], labels[j]))
- continue;
-
- /* Check if this group's label has a nonempty intersection with
- matches. */
- intersectf = 0;
for (k = 0; k < CHARCLASS_WORDS; ++k)
- intersectf |= intersect[k] = matches[k] & labels[j][k];
- if (!intersectf)
- continue;
-
- /* It does; now find the set differences both ways. */
- leftoversf = matchesf = 0;
- for (k = 0; k < CHARCLASS_WORDS; ++k)
- {
- /* Even an optimizing compiler can't know this for sure. */
- charclass_word match = matches[k], label = labels[j][k];
-
- leftoversf |= leftovers[k] = label & ~match;
- matchesf |= matches[k] = match & ~label;
- }
-
- /* If there were leftovers, create a new group labeled with them. */
- if (leftoversf)
- {
- copyset (leftovers, labels[ngrps]);
- copyset (intersect, labels[j]);
- grps[ngrps].elems = xnmalloc (d->nleaves,
- sizeof *grps[ngrps].elems);
- memcpy (grps[ngrps].elems, grps[j].elems,
- sizeof (grps[j].elems[0]) * grps[j].nelem);
- grps[ngrps].nelem = grps[j].nelem;
- ++ngrps;
- }
-
- /* Put the position in the current group. The constraint is
- irrelevant here. */
- grps[j].elems[grps[j].nelem++] = pos.index;
-
- /* If every character matching the current position has been
- accounted for, we're done. */
- if (!matchesf)
- break;
+ label[k] &= matches[k];
+ group.elems[group.nelem++] = pos.index;
}
-
- /* If we've passed the last group, and there are still characters
- unaccounted for, then we'll have to create a new group. */
- if (j == ngrps)
+ else
{
- copyset (matches, labels[ngrps]);
- zeroset (matches);
- grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
- grps[ngrps].nelem = 1;
- grps[ngrps].elems[0] = pos.index;
- ++ngrps;
+ for (k = 0; k < CHARCLASS_WORDS; ++k)
+ label[k] &= ~matches[k];
}
}
alloc_position_set (&follows, d->nleaves);
alloc_position_set (&tmp, d->nleaves);
- /* If we are a searching matcher, the default transition is to a state
- containing the positions of state 0, otherwise the default transition
- is to fail miserably. */
- if (d->searchflag)
+ if (group.nelem > 0)
{
- int c;
+ int possible_contexts; /* Contexts that the group can match. */
+ int separate_contexts; /* Context that new state wants to know. */
- state_newline = 0;
- state_letter = d->min_trcount - 1;
- state = d->initstate_notbol;
-
- for (c = 0; c < NOTCHAR; ++c)
- {
- switch (d->syntax.sbit[c])
- {
- case CTX_NEWLINE:
- trans[c] = state_newline;
- break;
- case CTX_LETTER:
- trans[c] = state_letter;
- break;
- default:
- trans[c] = state;
- break;
- }
- }
- }
- else
- for (i = 0; i < NOTCHAR; ++i)
- trans[i] = -1;
-
- for (i = 0; i < ngrps; ++i)
- {
follows.nelem = 0;
/* Find the union of the follows of the positions of the group.
This is a hideously inefficient loop. Fix it someday. */
- for (j = 0; j < grps[i].nelem; ++j)
- for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
- insert (d->follows[grps[i].elems[j]].elems[k], &follows);
+ for (j = 0; j < group.nelem; ++j)
+ for (k = 0; k < d->follows[group.elems[j]].nelem; ++k)
+ insert (d->follows[group.elems[j]].elems[k], &follows);
- if (d->localeinfo.multibyte)
+ /* If we are building a searching matcher, throw in the positions
+ of state 0 as well, if possible. */
+ if (d->searchflag)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
which are not 1st byte of the multibyte character.
- Then, if a state of follows encounter a byte, it must not be
- a 1st byte of a multibyte character nor single byte character.
- We cansel to add state[0].follows to next state, because
- state[0] must accept 1st-byte
-
- For example, we assume <sb a> is a certain single byte
- character, <mb A> is a certain multibyte character, and the
- codepoint of <sb a> equals the 2nd byte of the codepoint of
- <mb A>.
- When state[0] accepts <sb a>, state[i] transit to state[i+1]
- by accepting accepts 1st byte of <mb A>, and state[i+1]
- accepts 2nd byte of <mb A>, if state[i+1] encounter the
- codepoint of <sb a>, it must not be <sb a> but 2nd byte of
- <mb A>, so we cannot add state[0]. */
-
- next_isnt_1st_byte = false;
- for (j = 0; j < follows.nelem; ++j)
+ Then, if a state of follows encounters a byte, it must not be
+ a 1st byte of a multibyte character nor a single byte character.
+ In this case, do not add state[0].follows to next state, because
+ state[0] must accept 1st-byte.
+
+ For example, suppose <sb a> is a certain single byte character,
+ <mb A> is a certain multibyte character, and the codepoint of
+ <sb a> equals the 2nd byte of the codepoint of <mb A>. When
+ state[0] accepts <sb a>, state[i] transits to state[i+1] by
+ accepting the 1st byte of <mb A>, and state[i+1] accepts the
+ 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
+ <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
+ not add state[0]. */
+
+ bool mergeit = !d->localeinfo.multibyte;
+ if (!mergeit)
+ for (mergeit = true, j = 0; mergeit && j < follows.nelem; j++)
+ mergeit &= d->multibyte_prop[follows.elems[j].index];
+ if (mergeit)
{
- if (!(d->multibyte_prop[follows.elems[j].index] & 1))
- {
- next_isnt_1st_byte = true;
- break;
- }
+ merge (&d->states[0].elems, &follows, &tmp);
+ copy (&tmp, &follows);
}
}
- /* If we are building a searching matcher, throw in the positions
- of state 0 as well. */
- if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
- {
- merge (&d->states[0].elems, &follows, &tmp);
- copy (&tmp, &follows);
- }
-
/* Find out if the new state will want any context information. */
- possible_contexts = charclass_context (d, labels[i]);
+ possible_contexts = charclass_context (d, label);
separate_contexts = state_separate_contexts (&follows);
/* Find the state(s) corresponding to the union of the follows. */
@@ -2793,51 +2725,39 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state_index (d, &follows, CTX_LETTER);
else
state_letter = state;
+ }
-#ifdef DEBUG
- fprintf (stderr, "group %zu\n nextpos:", i);
- for (j = 0; j < grps[i].nelem; ++j)
- {
- fprintf (stderr, " %zu:", grps[i].elems[j]);
- prtok (d->tokens[grps[i].elems[j]]);
- }
- fprintf (stderr, "\n follows:");
- for (j = 0; j < follows.nelem; ++j)
+ /* If we are a searching matcher, the default transition is to a state
+ containing the positions of state 0, otherwise the default transition
+ is to fail miserably. */
+ else if (d->searchflag)
+ {
+ state_newline = 0;
+ state_letter = d->min_trcount - 1;
+ state = d->initstate_notbol;
+ }
+ else
+ {
+ state_newline = -1;
+ state_letter = -1;
+ state = -1;
+ }
+
+ /* Set the transitions for each character in the label. */
+ for (i = 0; i < NOTCHAR; i++)
+ if (tstbit (i, label))
+ switch (d->syntax.sbit[i])
{
- fprintf (stderr, " %zu:", follows.elems[j].index);
- prtok (d->tokens[follows.elems[j].index]);
+ case CTX_NEWLINE:
+ trans[i] = state_newline;
+ break;
+ case CTX_LETTER:
+ trans[i] = state_letter;
+ break;
+ default:
+ trans[i] = state;
+ break;
}
- fprintf (stderr, "\n states:");
- if (possible_contexts & CTX_NEWLINE)
- fprintf (stderr, " CTX_NEWLINE:%td", state_newline);
- if (possible_contexts & CTX_LETTER)
- fprintf (stderr, " CTX_LETTER:%td", state_letter);
- if (possible_contexts & CTX_NONE)
- fprintf (stderr, " CTX_NONE:%td", state);
- fprintf (stderr, "\n");
-#endif
-
- /* Set the transitions for each character in the current label. */
- for (j = 0; j < CHARCLASS_WORDS; ++j)
- for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
- if (labels[i][j] >> k & 1)
- {
- int c = j * CHARCLASS_WORD_BITS + k;
-
- switch (d->syntax.sbit[c])
- {
- case CTX_NEWLINE:
- trans[c] = state_newline;
- break;
- case CTX_LETTER:
- trans[c] = state_letter;
- break;
- default:
- trans[c] = state;
- break;
- }
- }
- }
#ifdef DEBUG
fprintf (stderr, "trans table %td", s);
@@ -2850,10 +2770,19 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
fprintf (stderr, "\n");
#endif
- for (i = 0; i < ngrps; ++i)
- free (grps[i].elems);
+ free (group.elems);
free (follows.elems);
free (tmp.elems);
+
+ /* Keep the newline transition in a special place so we can use it as
+ a sentinel. */
+ if (tstbit (d->syntax.eolbyte, label))
+ {
+ d->newlines[s] = trans[d->syntax.eolbyte];
+ trans[d->syntax.eolbyte] = -1;
+ }
+
+ return trans[uc];
}
/* Make sure D's state arrays are large enough to hold NEW_STATE. */
@@ -2863,23 +2792,23 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
state_num oldalloc = d->tralloc;
if (oldalloc <= new_state)
{
- state_num **realtrans = d->trans ? d->trans - 1 : NULL;
+ state_num **realtrans = d->trans ? d->trans - 2 : NULL;
size_t newalloc, newalloc1;
- newalloc1 = new_state + 1;
+ newalloc1 = realtrans ? new_state + 2 : 0;
realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
- realtrans[0] = NULL;
- d->trans = realtrans + 1;
- d->tralloc = newalloc = newalloc1 - 1;
+ realtrans[0] = realtrans[1] = NULL;
+ d->trans = realtrans + 2;
+ d->tralloc = newalloc = newalloc1 - 2;
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
if (d->localeinfo.multibyte)
{
- realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
+ realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
if (oldalloc == 0)
- realtrans[0] = NULL;
- d->mb_trans = realtrans + 1;
+ realtrans[0] = realtrans[1] = NULL;
+ d->mb_trans = realtrans + 2;
}
for (; oldalloc < newalloc; oldalloc++)
{
@@ -2891,47 +2820,48 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
}
}
-/* Some routines for manipulating a compiled dfa's transition tables.
- Each state may or may not have a transition table; if it does, and it
- is a non-accepting state, then d->trans[state] points to its table.
- If it is an accepting state then d->fails[state] points to its table.
- If it has no table at all, then d->trans[state] is NULL.
- TODO: Improve this comment, get rid of the unnecessary redundancy. */
+/* Calculate the transition table for a new state derived from state s
+ for a compiled dfa d after input character uc, and return the new
+ state number. */
-static void
-build_state (state_num s, struct dfa *d)
+static state_num
+build_state (state_num s, struct dfa *d, unsigned char uc)
{
state_num *trans; /* The new transition table. */
state_num i, maxstate;
- /* Set an upper limit on the number of transition tables that will ever
- exist at once. MAX_TRCOUNT is arbitrary. The idea is that the frequently
- used transition tables will be quickly rebuilt, whereas the ones that
- were only needed once or twice will be cleared away. However, do not
- clear the initial D->min_trcount states, since they are always used. */
- if (MAX_TRCOUNT <= d->trcount)
+ if (d->fails[s] != NULL)
+ trans = d->fails[s];
+ else
{
- for (i = d->min_trcount; i < d->tralloc; ++i)
+ state_num **ptrans = (ACCEPTING (s, *d) ? d->fails : d->trans) + s;
+ if (!*ptrans)
{
- free (d->trans[i]);
- free (d->fails[i]);
- d->trans[i] = d->fails[i] = NULL;
- }
- d->trcount = d->min_trcount;
-
- if (d->localeinfo.multibyte)
- {
- for (i = d->min_trcount; i < d->tralloc; i++)
+ /* MAX_TRCOUNT is an arbitrary upper limit on the number of
+ transition tables that can exist at once, other than for
+ initial states. Often-used transition tables are quickly
+ rebuilt, whereas rarely-used ones are cleared away. */
+ if (MAX_TRCOUNT <= d->trcount)
{
- free (d->mb_trans[i]);
- d->mb_trans[i] = NULL;
+ for (i = d->min_trcount; i < d->tralloc; i++)
+ {
+ free (d->trans[i]);
+ free (d->fails[i]);
+ d->trans[i] = d->fails[i] = NULL;
+ }
+ d->trcount = 0;
}
- free (d->mb_trans[-1]);
- d->mb_trans[-1] = NULL;
+
+ d->trcount++;
+ *ptrans = xmalloc (NOTCHAR * sizeof *trans);
}
- }
+ trans = *ptrans;
- ++d->trcount;
+ /* Fill transition table with a default value which means that the
+ transited state has not been calculated yet. */
+ for (i = 0; i < NOTCHAR; i++)
+ trans[i] = -2;
+ }
/* Set up the success bits for this state. */
d->success[s] = 0;
@@ -2942,8 +2872,7 @@ build_state (state_num s, struct dfa *d)
if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
d->success[s] |= CTX_NONE;
- trans = xmalloc (NOTCHAR * sizeof *trans);
- dfastate (s, d, trans);
+ s = dfastate (s, d, uc, trans);
/* Now go through the new transition table, and make sure that the trans
and fail arrays are allocated large enough to hold a pointer for the
@@ -2954,15 +2883,7 @@ build_state (state_num s, struct dfa *d)
maxstate = trans[i];
realloc_trans_if_necessary (d, maxstate);
- /* Keep the newline transition in a special place so we can use it as
- a sentinel. */
- d->newlines[s] = trans[d->syntax.eolbyte];
- trans[d->syntax.eolbyte] = -1;
-
- if (ACCEPTING (s, *d))
- d->fails[s] = trans;
- else
- d->trans[s] = trans;
+ return s;
}
/* Multibyte character handling sub-routines for dfaexec. */
@@ -2982,7 +2903,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
t = d->fails[s];
else
{
- build_state (s, d);
+ build_state (s, d, **pp);
if (d->trans[s])
t = d->trans[s];
else
@@ -2992,6 +2913,9 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
}
}
+ if (t[**pp] == -2)
+ build_state (s, d, **pp);
+
return t[*(*pp)++];
}
@@ -3057,7 +2981,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
return d->mb_trans[s][d->states[s1].mb_trindex];
- if (s < 0)
+ if (s == -1)
copy (&d->states[s1].mbps, &d->mb_follows);
else
merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
@@ -3165,10 +3089,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
if (!d->tralloc)
- {
- realloc_trans_if_necessary (d, 1);
- build_state (0, d);
- }
+ realloc_trans_if_necessary (d, 0);
s = s1 = 0;
p = mbp = (unsigned char const *) begin;
@@ -3238,21 +3159,28 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
if (s < 0)
{
- if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
+ if (s == -2)
+ {
+ s = build_state (s1, d, p[-1]);
+ trans = d->trans;
+ }
+ else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
+ {
+ /* The previous character was a newline. Count it, and skip
+ checking of multibyte character boundary until here. */
+ nlcount++;
+ mbp = p;
+
+ s = (allow_nl ? d->newlines[s1]
+ : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
+ : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+ : d->initstate_notbol);
+ }
+ else
{
p = NULL;
goto done;
}
-
- /* The previous character was a newline, count it, and skip
- checking of multibyte character boundary until here. */
- nlcount++;
- mbp = p;
-
- s = (allow_nl ? d->newlines[s1]
- : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
- : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
- : d->initstate_notbol);
}
else if (d->fails[s])
{
@@ -3282,7 +3210,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
else
{
- build_state (s, d);
+ build_state (s, d, p[0]);
trans = d->trans;
}
}
@@ -3362,7 +3290,7 @@ free_mbdata (struct dfa *d)
state_num s;
for (s = -1; s < d->tralloc; s++)
free (d->mb_trans[s]);
- free (d->mb_trans - 1);
+ free (d->mb_trans - 2);
}
}
@@ -3370,6 +3298,12 @@ free_mbdata (struct dfa *d)
static bool _GL_ATTRIBUTE_PURE
dfa_supported (struct dfa const *d)
{
+ /* Declare any non-UTF8 multibyte locale "not supported." Otherwise, a
+ regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
+ (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7') */
+ if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
+ return false;
+
size_t i;
for (i = 0; i < d->tindex; i++)
{
@@ -3436,7 +3370,6 @@ static void
dfassbuild (struct dfa *d)
{
size_t i, j;
- charclass ccl;
bool have_achar = false;
bool have_nchar = false;
struct dfa *sup = dfaalloc ();
@@ -3473,14 +3406,16 @@ dfassbuild (struct dfa *d)
case ANYCHAR:
case MBCSET:
case BACKREF:
- zeroset (ccl);
- notset (ccl);
- sup->tokens[j++] = CSET + charclass_index (sup, ccl);
- sup->tokens[j++] = STAR;
- if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
- || d->tokens[i + 1] == PLUS)
- i++;
- have_achar = true;
+ {
+ charclass ccl;
+ fillset (ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
+ sup->tokens[j++] = STAR;
+ if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
+ || d->tokens[i + 1] == PLUS)
+ i++;
+ have_achar = true;
+ }
break;
case BEGWORD:
case ENDWORD:
@@ -3571,7 +3506,7 @@ dfafree (struct dfa *d)
free (d->fails[i]);
}
- free (d->trans - 1);
+ free (d->trans - 2);
free (d->fails);
free (d->newlines);
free (d->success);