about | summary | refs | log | tree | commit | diff | stats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r-- dfa.c 207
1 files changed, 134 insertions, 73 deletions
diff --git a/dfa.c b/dfa.c
index 3621676a..fdaaadbd 100644
--- a/dfa.c
+++ b/dfa.c
@@ -58,8 +58,11 @@
/* We can handle multibyte strings. */
#include <wchar.h>
#include <wctype.h>
+
+#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif
+#endif
#include "regex.h"
#include "dfa.h"
@@ -726,33 +729,37 @@ in_coll_range (char ch, char from, char to)
typedef int predicate (int);
+#ifdef GAWK
+#define bool int
+#define true (1)
+#define false (0)
+#endif
/* The following list maps the names of the Posix named character classes
to predicate functions that determine whether a given character is in
the class. The leading [ has already been eaten by the lexical analyzer. */
-static struct {
+struct dfa_ctype { /* one entry of the named-character-class table below */
const char *name;
- predicate *pred;
-} const prednames[] = {
- { "alpha", isalpha },
- { "upper", isupper },
- { "lower", islower },
- { "digit", isdigit },
- { "xdigit", isxdigit },
- { "space", isspace },
- { "punct", ispunct },
- { "alnum", isalnum },
- { "print", isprint },
- { "graph", isgraph },
- { "cntrl", iscntrl },
-#ifdef GAWK
- { "blank", is_blank },
-#else
- { "blank", isblank },
-#endif
- { NULL, NULL }
+ predicate *func; /* membership test; parse_bracket_exp calls it for each c2 in [0, NOTCHAR) */
+ bool single_byte_only; /* when true, parse_bracket_exp skips its MB_CUR_MAX > 1 wctype branch for this class */
+};
+
+static const struct dfa_ctype prednames[] = { /* searched by find_pred; the NULL-named entry ends the table */
+ { "alpha", isalpha, false },
+ { "upper", isupper, false },
+ { "lower", islower, false },
+ { "digit", isdigit, true }, /* only "digit" and "xdigit" are marked single-byte-only */
+ { "xdigit", isxdigit, true },
+ { "space", isspace, false },
+ { "punct", ispunct, false },
+ { "alnum", isalnum, false },
+ { "print", isprint, false },
+ { "graph", isgraph, false },
+ { "cntrl", iscntrl, false },
+ { "blank", is_blank, false }, /* NOTE(review): the non-GAWK isblank fallback is dropped here; is_blank must be defined in every build -- confirm */
+ { NULL, NULL, false }
};
-static predicate *
+static const struct dfa_ctype *
find_pred (const char *str)
{
unsigned int i;
@@ -760,7 +767,7 @@ find_pred (const char *str)
if (STREQ (str, prednames[i].name))
break;
- return prednames[i].pred;
+ return &prednames[i];
}
/* Multibyte character handling sub-routine for lex.
@@ -773,6 +780,13 @@ parse_bracket_exp (void)
int c, c1, c2;
charclass ccl;
+ /* Used to warn about [:space:].
+ Bit 0 = first character is a colon.
+ Bit 1 = last character is a colon.
+ Bit 2 = includes any other character but a colon.
+ Bit 3 = includes ranges, char/equiv classes or collation elements. */
+ int colon_warning_state;
+
#if MBS_SUPPORT
wint_t wc, wc1, wc2;
@@ -811,9 +825,11 @@ parse_bracket_exp (void)
else
invert = 0;
+ colon_warning_state = (c == ':');
do
{
c1 = EOF; /* mark c1 is not initialized". */
+ colon_warning_state &= ~2;
/* Note that if we're looking at some other [:...:] construct,
we just treat it as a bunch of ordinary characters. We can do
@@ -857,8 +873,12 @@ parse_bracket_exp (void)
|| STREQ (str, "lower"))
? "alpha"
: str);
+ const struct dfa_ctype *pred = find_pred (class);
+ if (!pred)
+ dfaerror(_("invalid character class"));
+
#if MBS_SUPPORT
- if (MB_CUR_MAX > 1)
+ if (MB_CUR_MAX > 1 && !pred->single_byte_only)
{
/* Store the character class as wctype_t. */
wctype_t wt = wctype (class);
@@ -872,14 +892,9 @@ parse_bracket_exp (void)
}
#endif
- {
- predicate *pred = find_pred (class);
- if (!pred)
- dfaerror(_("invalid character class"));
- for (c2 = 0; c2 < NOTCHAR; ++c2)
- if ((*pred)(c2))
- setbit_case_fold (c2, ccl);
- }
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if (pred->func(c2))
+ setbit_case_fold (c2, ccl);
}
#if MBS_SUPPORT
@@ -912,6 +927,7 @@ parse_bracket_exp (void)
}
}
#endif
+ colon_warning_state |= 8;
/* Fetch new lookahead character. */
FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -997,10 +1013,13 @@ parse_bracket_exp (void)
setbit_case_fold (c, ccl);
}
+ colon_warning_state |= 8;
FETCH_WC(c1, wc1, _("unbalanced ["));
continue;
}
+ colon_warning_state |= (c == ':') ? 2 : 4;
+
#if MBS_SUPPORT
/* Build normal characters. */
setbit_case_fold (wc, ccl);
@@ -1040,6 +1059,9 @@ parse_bracket_exp (void)
#endif
(c = c1) != ']'));
+ if (colon_warning_state == 7)
+ dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
+
#if MBS_SUPPORT
if (MB_CUR_MAX > 1
&& (!using_utf8()
@@ -2310,8 +2332,8 @@ dfaanalyze (struct dfa *d, int searchflag)
void
dfastate (int s, struct dfa *d, int trans[])
{
- position_set grps[NOTCHAR]; /* As many as will ever be needed. */
- charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
+ position_set *grps; /* As many as will ever be needed. */
+ charclass *labels; /* Labels corresponding to the groups. */
int ngrps = 0; /* Number of groups actually used. */
position pos; /* Current position being considered. */
charclass matches; /* Set of matching characters. */
@@ -2335,6 +2357,9 @@ dfastate (int s, struct dfa *d, int trans[])
#endif
int i, j, k;
+ grps = xnmalloc (NOTCHAR, sizeof *grps);
+ labels = xnmalloc (NOTCHAR, sizeof *labels);
+
/* Initialize the set of letters, if necessary. */
if (! initialized)
{
@@ -2597,6 +2622,8 @@ dfastate (int s, struct dfa *d, int trans[])
free(grps[i].elems);
free(follows.elems);
free(tmp.elems);
+ free(grps);
+ free(labels);
}
/* Some routines for manipulating a compiled dfa's transition tables.
@@ -3121,6 +3148,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
return s1;
}
+/* Initialize mblen_buf and inputwcs with data from the next line: scan BEGIN through END inclusive, stop early at the first single-byte character equal to eolbyte, and set buf_begin/buf_end to the start and stopping position of the scan. Uses and updates the shared conversion state mbs. */
+
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+ unsigned char eol = eolbyte;
+ size_t remain_bytes, i;
+
+ buf_begin = (unsigned char *) begin;
+
+ remain_bytes = 0; /* bytes of the current multibyte character not yet recorded, this position included */
+ for (i = 0; i < end - begin + 1; i++) /* end - begin + 1 bytes: the byte at END itself is scanned too */
+ {
+ if (remain_bytes == 0) /* position i starts a new character */
+ {
+ remain_bytes
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+ if (remain_bytes < 1 /* size_t, so this means mbrtowc returned 0 (null wide character) */
+ || remain_bytes == (size_t) -1 /* invalid sequence; NOTE(review): ISO C leaves mbs unspecified after this, yet it is reused without a reset -- confirm */
+ || remain_bytes == (size_t) -2 /* incomplete sequence within the bytes offered */
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) /* one byte that converts to itself */
+ {
+ remain_bytes = 0; /* handle this byte as a plain single-byte character */
+ inputwcs[i] = (wchar_t)begin[i]; /* NOTE(review): begin[i] is plain char; bytes >= 0x80 sign-extend where char is signed -- confirm intended */
+ mblen_buf[i] = 0; /* 0 marks a single-byte position */
+ if (begin[i] == eol) /* NOTE(review): plain char vs unsigned char comparison; only reliable when eolbyte < 0x80 -- confirm */
+ break; /* stop with i indexing the end-of-line byte */
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes; /* lead byte records the full byte length of the character */
+ remain_bytes--;
+ }
+ }
+ else /* position i is a trailing byte of a multibyte character */
+ {
+ mblen_buf[i] = remain_bytes; /* count of bytes left in the character, this one included */
+ inputwcs[i] = 0; /* trailing bytes carry no wide character of their own */
+ remain_bytes--;
+ }
+ }
+
+ buf_end = (unsigned char *) (begin + i); /* the end-of-line byte if one was found, otherwise END + 1 */
+ mblen_buf[i] = 0; /* i can reach end - begin + 1; the MALLOC calls in dfaexec size both arrays at end - begin + 2 */
+ inputwcs[i] = 0; /* sentinel */
+}
+
#endif /* MBS_SUPPORT */
/* Search through a buffer looking for a match to the given struct dfa.
@@ -3144,9 +3218,9 @@ dfaexec (struct dfa *d, char const *begin, char *end,
int **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
unsigned char eol = eolbyte; /* Likewise for eolbyte. */
+ unsigned char saved_end;
static int sbit[NOTCHAR]; /* Table for anding with d->success. */
static int sbit_init;
- unsigned char saved_end;
if (! sbit_init)
{
@@ -3170,44 +3244,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
- unsigned int i;
- int remain_bytes;
- buf_begin = (unsigned char *) begin;
- buf_end = (unsigned char *) end;
-
- /* initialize mblen_buf, and inputwcs. */
MALLOC(mblen_buf, unsigned char, end - begin + 2);
MALLOC(inputwcs, wchar_t, end - begin + 2);
- memset(&mbs, 0, sizeof mbs);
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
- {
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t)begin[i];
- mblen_buf[i] = 0;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- inputwcs[i] = 0;
- remain_bytes--;
- }
- }
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
+ memset(&mbs, 0, sizeof(mbstate_t));
+ prepare_wc_buf ((const char *) p, end);
}
#endif /* MBS_SUPPORT */
@@ -3217,7 +3257,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
while ((t = trans[s]))
{
- if ((char *) p > end)
+ if (p > buf_end)
break;
s1 = s;
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -3228,6 +3268,19 @@ dfaexec (struct dfa *d, char const *begin, char *end,
continue;
}
+ /* Falling back to the glibc matcher in this case gives
+ better performance (up to 25% better on [a-z], for
+ example) and enables support for collating symbols and
+ equivalence classes. */
+ if (backref)
+ {
+ *backref = 1;
+ free(mblen_buf);
+ free(inputwcs);
+ *end = saved_end;
+ return (char *) p;
+ }
+
/* Can match with a multibyte character (and multi character
collating element). Transition table might be updated. */
s = transit_state(d, s, &p);
@@ -3277,8 +3330,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
/* If the previous character was a newline, count it. */
- if (count && (char *) p <= end && p[-1] == eol)
- ++*count;
+ if ((char *) p <= end && p[-1] == eol)
+ {
+ if (count)
+ ++*count;
+
+#if MBS_SUPPORT
+ if (d->mb_cur_max > 1)
+ prepare_wc_buf ((const char *) p, end);
+#endif
+ }
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)