diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 105 |
1 files changed, 79 insertions, 26 deletions
@@ -375,6 +375,9 @@ struct dfa token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, int, size_t *, int *); + /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. @@ -3321,6 +3324,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, return s1; } +/* The initial state may encounter a byte which is not a single byte character + nor the first byte of a multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, in Shift JIS the regular + expression "\\" accepts the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial state must skip the bytes + that are not a single byte character nor the first byte of a multibyte + character. */ +static unsigned char const * +skip_remains_mb (struct dfa *d, unsigned char const *p, + unsigned char const *mbp, char const *end) +{ + wint_t wc; + while (mbp < p) + mbp += mbs_to_wchar (&wc, (char const *) mbp, + end - (char const *) mbp, d); + return mbp; +} + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -3332,10 +3353,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, If COUNT is non-NULL, increment *COUNT once for each newline processed. Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we encountered a back-reference (1) or not (0). The caller may use this - to decide whether to fall back on a backtracking matcher. */ -char * -dfaexec (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref) + to decide whether to fall back on a backtracking matcher. + + If MULTIBYTE, the input consists of multibyte characters and/or + encoding-error bytes. Otherwise, the input consists of single-byte + characters. */ +static inline char * +dfaexec_main (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref, bool multibyte) { state_num s, s1; /* Current state. */ unsigned char const *p, *mbp; /* Current input character. */ @@ -3357,7 +3382,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, saved_end = *(unsigned char *) end; *end = eol; - if (d->multibyte) + if (multibyte) { memset (&d->mbs, 0, sizeof d->mbs); if (! d->mb_match_lens) @@ -3369,7 +3394,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, for (;;) { - if (d->multibyte) + if (multibyte) { while ((t = trans[s]) != NULL) { @@ -3377,27 +3402,18 @@ dfaexec (struct dfa *d, char const *begin, char *end, if (s == 0) { - /* The initial state may encounter a byte which is not - a single byte character nor the first byte of a - multibyte character. But it is incorrect for the - initial state to accept such a byte. For example, - in Shift JIS the regular expression "\\" accepts - the codepoint 0x5c, but should not accept the second - byte of the codepoint 0x815c. Then the initial - state must skip the bytes that are not a single - byte character nor the first byte of a multibyte - character. */ - wint_t wc; - while (mbp < p) - mbp += mbs_to_wchar (&wc, (char const *) mbp, - end - (char const *) mbp, d); - p = mbp; - - if ((char *) p > end) + if (d->states[s].mbps.nelem == 0) { - p = NULL; - goto done; + do + { + while (t[*p] == 0) + p++; + p = mbp = skip_remains_mb (d, p, mbp, end); + } + while (t[*p] == 0); } + else + p = mbp = skip_remains_mb (d, p, mbp, end); } if (d->states[s].mbps.nelem == 0) @@ -3425,6 +3441,13 @@ dfaexec (struct dfa *d, char const *begin, char *end, } else { + if (s == 0 && (t = trans[s]) != NULL) + { + while (t[*p] == 0) + p++; + s = t[*p++]; + } + while ((t = trans[s]) != NULL) { s1 = t[*p++]; @@ -3455,7 +3478,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, } s1 = s; - if (d->multibyte) + if (multibyte) { /* Can match with a multibyte character (and multicharacter collating element). Transition table might be updated. */ @@ -3500,6 +3523,33 @@ dfaexec (struct dfa *d, char const *begin, char *end, return (char *) p; } +/* Specialized versions of dfaexec_main for multibyte and single-byte + cases. This is for performance. */ + +static char * +dfaexec_mb (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return dfaexec_main (d, begin, end, allow_nl, count, backref, true); +} + +static char * +dfaexec_sb (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return dfaexec_main (d, begin, end, allow_nl, count, backref, false); +} + +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte), + but faster. */ + +char * +dfaexec (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + return d->dfaexec (d, begin, end, allow_nl, count, backref); +} + struct dfa * dfasuperset (struct dfa const *d) { @@ -3549,6 +3599,7 @@ dfainit (struct dfa *d) { memset (d, 0, sizeof *d); d->multibyte = MB_CUR_MAX > 1; + d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; d->fast = !d->multibyte; } @@ -3589,6 +3640,7 @@ dfaoptimize (struct dfa *d) free_mbdata (d); d->multibyte = false; + d->dfaexec = dfaexec_sb; } static void @@ -3602,6 +3654,7 @@ dfassbuild (struct dfa *d) *sup = *d; sup->multibyte = false; + sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; sup->superset = NULL; |