aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c105
1 files changed, 79 insertions, 26 deletions
diff --git a/dfa.c b/dfa.c
index 2d0e7f20..6179a3d3 100644
--- a/dfa.c
+++ b/dfa.c
@@ -367,6 +367,9 @@ struct dfa
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *, int, size_t *, int *);
+
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -3309,6 +3312,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
return s1;
}
+/* The initial state may encounter a byte which is not a single byte character
+ nor the first byte of a multibyte character. But it is incorrect for the
+ initial state to accept such a byte. For example, in Shift JIS the regular
+ expression "\\" accepts the codepoint 0x5c, but should not accept the second
+ byte of the codepoint 0x815c. Then the initial state must skip the bytes
+ that are not a single byte character nor the first byte of a multibyte
+ character. */
+static unsigned char const *
+skip_remains_mb (struct dfa *d, unsigned char const *p,
+ unsigned char const *mbp, char const *end)
+{
+ wint_t wc;
+ while (mbp < p)
+ mbp += mbs_to_wchar (&wc, (char const *) mbp,
+ end - (char const *) mbp, d);
+ return mbp;
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3320,10 +3341,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
If COUNT is non-NULL, increment *COUNT once for each newline processed.
Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
encountered a back-reference (1) or not (0). The caller may use this
- to decide whether to fall back on a backtracking matcher. */
-char *
-dfaexec (struct dfa *d, char const *begin, char *end,
- int allow_nl, size_t *count, int *backref)
+ to decide whether to fall back on a backtracking matcher.
+
+ If MULTIBYTE, the input consists of multibyte characters and/or
+ encoding-error bytes. Otherwise, the input consists of single-byte
+ characters. */
+static inline char *
+dfaexec_main (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref, bool multibyte)
{
state_num s, s1; /* Current state. */
unsigned char const *p, *mbp; /* Current input character. */
@@ -3345,7 +3370,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
saved_end = *(unsigned char *) end;
*end = eol;
- if (d->multibyte)
+ if (multibyte)
{
memset (&d->mbs, 0, sizeof d->mbs);
if (! d->mb_match_lens)
@@ -3357,7 +3382,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
for (;;)
{
- if (d->multibyte)
+ if (multibyte)
{
while ((t = trans[s]) != NULL)
{
@@ -3365,27 +3390,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (s == 0)
{
- /* The initial state may encounter a byte which is not
- a single byte character nor the first byte of a
- multibyte character. But it is incorrect for the
- initial state to accept such a byte. For example,
- in Shift JIS the regular expression "\\" accepts
- the codepoint 0x5c, but should not accept the second
- byte of the codepoint 0x815c. Then the initial
- state must skip the bytes that are not a single
- byte character nor the first byte of a multibyte
- character. */
- wint_t wc;
- while (mbp < p)
- mbp += mbs_to_wchar (&wc, (char const *) mbp,
- end - (char const *) mbp, d);
- p = mbp;
-
- if ((char *) p > end)
+ if (d->states[s].mbps.nelem == 0)
{
- p = NULL;
- goto done;
+ do
+ {
+ while (t[*p] == 0)
+ p++;
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+ while (t[*p] == 0);
}
+ else
+ p = mbp = skip_remains_mb (d, p, mbp, end);
}
if (d->states[s].mbps.nelem == 0)
@@ -3413,6 +3429,13 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
else
{
+ if (s == 0 && (t = trans[s]) != NULL)
+ {
+ while (t[*p] == 0)
+ p++;
+ s = t[*p++];
+ }
+
while ((t = trans[s]) != NULL)
{
s1 = t[*p++];
@@ -3443,7 +3466,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
s1 = s;
- if (d->multibyte)
+ if (multibyte)
{
/* Can match with a multibyte character (and multicharacter
collating element). Transition table might be updated. */
@@ -3488,6 +3511,33 @@ dfaexec (struct dfa *d, char const *begin, char *end,
return (char *) p;
}
+/* Specialized versions of dfaexec_main for multibyte and single-byte
+ cases. This is for performance. */
+
+static char *
+dfaexec_mb (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+}
+
+static char *
+dfaexec_sb (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+}
+
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
+ but faster. */
+
+char *
+dfaexec (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ return d->dfaexec (d, begin, end, allow_nl, count, backref);
+}
+
struct dfa *
dfasuperset (struct dfa const *d)
{
@@ -3537,6 +3587,7 @@ dfainit (struct dfa *d)
{
memset (d, 0, sizeof *d);
d->multibyte = MB_CUR_MAX > 1;
+ d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
d->fast = !d->multibyte;
}
@@ -3577,6 +3628,7 @@ dfaoptimize (struct dfa *d)
free_mbdata (d);
d->multibyte = false;
+ d->dfaexec = dfaexec_sb;
}
static void
@@ -3590,6 +3642,7 @@ dfassbuild (struct dfa *d)
*sup = *d;
sup->multibyte = false;
+ sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
sup->superset = NULL;