aboutsummaryrefslogtreecommitdiffstats
path: root/re.c
diff options
context:
space:
mode:
Diffstat (limited to 're.c')
-rw-r--r-- re.c 68
1 files changed, 28 insertions, 40 deletions
diff --git a/re.c b/re.c
index 878c884e..167a265d 100644
--- a/re.c
+++ b/re.c
@@ -25,10 +25,14 @@
#include "awk.h"
+#include "localeinfo.h"
+
static reg_syntax_t syn;
static void check_bracket_exp(char *s, size_t len);
const char *regexflags2str(int flags);
+static struct localeinfo localeinfo;
+
/* make_regexp --- generate compiled regular expressions */
Regexp *
@@ -45,7 +49,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
int c, c2;
static bool first = true;
static bool no_dfa = false;
- bool has_anchor = false;
reg_syntax_t dfa_syn;
int i;
@@ -75,10 +78,10 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
* from that.
*/
if (buf == NULL) {
- emalloc(buf, char *, len + 2, "make_regexp");
+ emalloc(buf, char *, len + 1, "make_regexp");
buflen = len;
} else if (len > buflen) {
- erealloc(buf, char *, len + 2, "make_regexp");
+ erealloc(buf, char *, len + 1, "make_regexp");
buflen = len;
}
dest = buf;
@@ -156,9 +159,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
} /* switch */
} else {
c = *src;
- if (c == '^' || c == '$')
- has_anchor = true;
-
*dest++ = *src++; /* not '\\' */
}
if (gawk_mb_cur_max > 1 && is_multibyte)
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
- rp->dfareg = NULL;
rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
@@ -223,13 +222,12 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
/* gack. this must be done *after* re_compile_pattern */
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
- rp->dfa = true;
rp->dfareg = dfaalloc();
- dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
+ dfasyntax(rp->dfareg, & localeinfo, dfa_syn,
+ (ignorecase ? DFA_CASE_FOLD : 0) | DFA_ANCHOR);
dfacomp(buf, len, rp->dfareg, true);
} else
- rp->dfa = false;
- rp->has_anchor = has_anchor;
+ rp->dfareg = NULL;
/* Additional flags that help with RS as regexp. */
for (i = 0; i < len; i++) {
@@ -278,26 +276,24 @@ research(Regexp *rp, char *str, int start,
* starts in the middle of a string, so don't bother trying it
* in that case.
*/
- if (rp->dfa && ! no_bol && start == 0) {
- char save;
- size_t count = 0;
+ if (rp->dfareg != NULL && ! no_bol && start == 0) {
struct dfa *superset = dfasuperset(rp->dfareg);
- /*
- * dfa likes to stick a '\n' right after the matched
- * text. So we just save and restore the character.
- */
- save = str[start+len];
if (superset)
ret = dfaexec(superset, str+start, str+start+len,
true, NULL, NULL);
- if (ret)
+
+ if (ret && (! need_start
+ || (! superset && dfaisfast(rp->dfareg))))
ret = dfaexec(rp->dfareg, str+start, str+start+len,
- true, &count, &try_backref);
- str[start+len] = save;
+ true, NULL, &try_backref);
}
if (ret) {
- if (need_start || rp->dfa == false || try_backref) {
+ if ( rp->dfareg == NULL
+ || start != 0
+ || no_bol
+ || need_start
+ || try_backref) {
/*
* Passing NULL as last arg speeds up search for cases
* where we don't need the start/end info.
@@ -326,7 +322,7 @@ refree(Regexp *rp)
free(rp->regs.start);
if (rp->regs.end)
free(rp->regs.end);
- if (rp->dfa) {
+ if (rp->dfareg != NULL) {
dfafree(rp->dfareg);
free(rp->dfareg);
}
@@ -359,7 +355,7 @@ re_update(NODE *t)
t1 = t->re_exp;
if (t->re_text != NULL) {
/* if contents haven't changed, just return it */
- if (cmp_nodes(t->re_text, t1) == 0)
+ if (cmp_nodes(t->re_text, t1, true) == 0)
return t->re_reg;
/* things changed, fall through to recompile */
unref(t->re_text);
@@ -398,6 +394,9 @@ re_update(NODE *t)
void
resetup()
{
+ // init localeinfo for dfa
+ init_localeinfo(& localeinfo);
+
/*
* Syntax bits: _that_ is yet another mind trip. Recreational drugs
* are helpful for recovering from the experience.
@@ -421,25 +420,14 @@ resetup()
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
(void) re_set_syntax(syn);
-
- dfa_init();
}
-/* avoid_dfa --- return true if we should not use the DFA matcher */
+/* using_utf8 --- are we using utf8 */
-int
-avoid_dfa(NODE *re, char *str, size_t len)
+bool
+using_utf8(void)
{
- char *end;
-
- if (! re->re_reg->has_anchor)
- return false;
-
- for (end = str + len; str < end; str++)
- if (*str == '\n')
- return true;
-
- return false;
+ return localeinfo.using_utf8;
}
/* reisstring --- return true if the RE match is a simple string match */