aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2014-10-13 11:27:13 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2014-10-13 11:27:13 +0300
commit 0485d6bfe2417a7640ef95c9de6f48e1f35003fd (patch)
tree f26c8a24a4a12bf9be5038f40e6a20a4f6e9aa79 /dfa.c
parent 7504a8fbc86b327ad07c79c943b8fe2d253f256d (diff)
parent 2a8c128ca91b42261720368e5d25431ee4362c70 (diff)
download egawk-0485d6bfe2417a7640ef95c9de6f48e1f35003fd.tar.gz
egawk-0485d6bfe2417a7640ef95c9de6f48e1f35003fd.tar.bz2
egawk-0485d6bfe2417a7640ef95c9de6f48e1f35003fd.zip
Merge branch 'master' into cmake
Diffstat (limited to 'dfa.c')
-rw-r--r-- dfa.c 162
1 files changed, 112 insertions, 50 deletions
diff --git a/dfa.c b/dfa.c
index 1a153785..f810a262 100644
--- a/dfa.c
+++ b/dfa.c
@@ -77,14 +77,6 @@ is_blank (int c)
}
#endif /* GAWK */
-#ifdef LIBC_IS_BORKED
-extern int gawk_mb_cur_max;
-#undef MB_CUR_MAX
-#define MB_CUR_MAX gawk_mb_cur_max
-#undef mbrtowc
-#define mbrtowc(a, b, c, d) (-1)
-#endif
-
/* HPUX defines these as macros in sys/param.h. */
#ifdef setbit
# undef setbit
@@ -375,6 +367,9 @@ struct dfa
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *, int, size_t *, int *);
+
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -828,10 +823,6 @@ using_utf8 (void)
wchar_t wc;
mbstate_t mbs = { 0 };
utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-#ifdef LIBC_IS_BORKED
- if (gawk_mb_cur_max == 1)
- utf8 = 0;
-#endif
}
return utf8;
}
@@ -3321,6 +3312,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
return s1;
}
+/* The initial state may encounter a byte that is neither a single-byte
+ character nor the first byte of a multibyte character, and it is
+ incorrect for the initial state to accept such a byte. For example, in
+ Shift JIS the regular expression "\\" accepts the codepoint 0x5c, but
+ should not accept the second byte of the codepoint 0x815c. The initial
+ state must therefore skip the bytes that are neither a single-byte
+ character nor the first byte of a multibyte character.
+
+ Starting at MBP, repeatedly call mbs_to_wchar on the bytes from the
+ current position up to END and advance by the value it returns, until
+ the position is no longer before P. Return the resulting position; if
+ MBP is already at or beyond P it is returned unchanged. D is handed
+ through to mbs_to_wchar. */
+static unsigned char const *
+skip_remains_mb (struct dfa *d, unsigned char const *p,
+ unsigned char const *mbp, char const *end)
+{
+ wint_t wc; /* Receives each converted character; not otherwise used here. */
+ while (mbp < p)
+ mbp += mbs_to_wchar (&wc, (char const *) mbp,
+ end - (char const *) mbp, d);
+ return mbp;
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3332,10 +3341,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
If COUNT is non-NULL, increment *COUNT once for each newline processed.
Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
encountered a back-reference (1) or not (0). The caller may use this
- to decide whether to fall back on a backtracking matcher. */
-char *
-dfaexec (struct dfa *d, char const *begin, char *end,
- int allow_nl, size_t *count, int *backref)
+ to decide whether to fall back on a backtracking matcher.
+
+ If MULTIBYTE, the input consists of multibyte characters and/or
+ encoding-error bytes. Otherwise, the input consists of single-byte
+ characters. */
+static inline char *
+dfaexec_main (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref, bool multibyte)
{
state_num s, s1; /* Current state. */
unsigned char const *p, *mbp; /* Current input character. */
@@ -3357,7 +3370,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
saved_end = *(unsigned char *) end;
*end = eol;
- if (d->multibyte)
+ if (multibyte)
{
memset (&d->mbs, 0, sizeof d->mbs);
if (! d->mb_match_lens)
@@ -3369,7 +3382,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
for (;;)
{
- if (d->multibyte)
+ if (multibyte)
{
while ((t = trans[s]) != NULL)
{
@@ -3377,27 +3390,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (s == 0)
{
- /* The initial state may encounter a byte which is not
- a single byte character nor the first byte of a
- multibyte character. But it is incorrect for the
- initial state to accept such a byte. For example,
- in Shift JIS the regular expression "\\" accepts
- the codepoint 0x5c, but should not accept the second
- byte of the codepoint 0x815c. Then the initial
- state must skip the bytes that are not a single
- byte character nor the first byte of a multibyte
- character. */
- wint_t wc;
- while (mbp < p)
- mbp += mbs_to_wchar (&wc, (char const *) mbp,
- end - (char const *) mbp, d);
- p = mbp;
-
- if ((char *) p > end)
+ if (d->states[s].mbps.nelem == 0)
{
- p = NULL;
- goto done;
+ do
+ {
+ while (t[*p] == 0)
+ p++;
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+ while (t[*p] == 0);
}
+ else
+ p = mbp = skip_remains_mb (d, p, mbp, end);
}
if (d->states[s].mbps.nelem == 0)
@@ -3416,15 +3420,49 @@ dfaexec (struct dfa *d, char const *begin, char *end,
goto done;
}
- /* Can match with a multibyte character (and multi character
- collating element). Transition table might be updated. */
- s = transit_state (d, s, &p, (unsigned char *) end);
- mbp = p;
- trans = d->trans;
+ /* The following code is used twice.
+ Use a macro to avoid the risk that they diverge. */
+#define State_transition() \
+ do { \
+ /* Can match with a multibyte character (and multi-character \
+ collating element). Transition table might be updated. */ \
+ s = transit_state (d, s, &p, (unsigned char *) end); \
+ \
+ /* If previous character is newline after a transition \
+ for ANYCHAR or MBCSET in non-UTF8 multibyte locales, \
+ check whether current position is beyond the end of \
+ the input buffer. Also, transit to initial state if \
+ !ALLOW_NL, even if RE_DOT_NEWLINE is set. */ \
+ if (p[-1] == eol) \
+ { \
+ if ((char *) p > end) \
+ { \
+ p = NULL; \
+ goto done; \
+ } \
+ \
+ nlcount++; \
+ \
+ if (!allow_nl) \
+ s = 0; \
+ } \
+ \
+ mbp = p; \
+ trans = d->trans; \
+ } while (0)
+
+ State_transition();
}
}
else
{
+ if (s == 0 && (t = trans[s]) != NULL)
+ {
+ while (t[*p] == 0)
+ p++;
+ s = t[*p++];
+ }
+
while ((t = trans[s]) != NULL)
{
s1 = t[*p++];
@@ -3455,14 +3493,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
s1 = s;
- if (d->multibyte)
- {
- /* Can match with a multibyte character (and multicharacter
- collating element). Transition table might be updated. */
- s = transit_state (d, s, &p, (unsigned char *) end);
- mbp = p;
- trans = d->trans;
- }
+ if (multibyte)
+ State_transition();
else
s = d->fails[s][*p++];
continue;
@@ -3500,6 +3532,33 @@ dfaexec (struct dfa *d, char const *begin, char *end,
return (char *) p;
}
+/* Specialized versions of dfaexec_main for the multibyte and single-byte
+ cases. This is for performance: dfaexec_main is static inline and each
+ wrapper passes a constant MULTIBYTE argument (presumably so the compiler
+ can drop the branches for the other case in each copy).
+
+ dfaexec_mb is the multibyte variant: it forwards D, BEGIN, END, ALLOW_NL,
+ COUNT and BACKREF unchanged and passes true for MULTIBYTE. */
+
+static char *
+dfaexec_mb (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+}
+
+static char *
+dfaexec_sb (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{ /* Single-byte variant of dfaexec_main: forward every argument unchanged
+ and pass false for MULTIBYTE. */
+ return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+}
+
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
+ but faster.
+
+ Instead of testing D->multibyte on every call, dispatch through the
+ function pointer D->dfaexec. dfainit sets that pointer to dfaexec_mb or
+ dfaexec_sb according to D->multibyte, and dfaoptimize and dfassbuild set
+ it to dfaexec_sb where they clear the multibyte flag. The value returned
+ by the selected implementation is returned unchanged. */
+
+char *
+dfaexec (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ return d->dfaexec (d, begin, end, allow_nl, count, backref);
+}
+
struct dfa *
dfasuperset (struct dfa const *d)
{
@@ -3549,6 +3608,7 @@ dfainit (struct dfa *d)
{
memset (d, 0, sizeof *d);
d->multibyte = MB_CUR_MAX > 1;
+ d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
d->fast = !d->multibyte;
}
@@ -3589,6 +3649,7 @@ dfaoptimize (struct dfa *d)
free_mbdata (d);
d->multibyte = false;
+ d->dfaexec = dfaexec_sb;
}
static void
@@ -3602,6 +3663,7 @@ dfassbuild (struct dfa *d)
*sup = *d;
sup->multibyte = false;
+ sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
sup->superset = NULL;