aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2016-09-08 05:50:26 +0300
committerArnold D. Robbins <arnold@skeeve.com>2016-09-08 05:50:26 +0300
commita159bf0b87d8bbb576d736ea54c97d0271166620 (patch)
treee55dce2e0a3a02104fb6157b1220cd1e7aade849
parentc49b94108f48ea56f705d8549988c00289a0e2a2 (diff)
downloadegawk-a159bf0b87d8bbb576d736ea54c97d0271166620.tar.gz
egawk-a159bf0b87d8bbb576d736ea54c97d0271166620.tar.bz2
egawk-a159bf0b87d8bbb576d736ea54c97d0271166620.zip
Merge grep dfa.
-rw-r--r--ChangeLog5
-rw-r--r--dfa.c129
-rw-r--r--dfa.h28
-rw-r--r--helpers/ChangeLog4
-rw-r--r--helpers/testdfa.c10
-rw-r--r--re.c3
6 files changed, 90 insertions, 89 deletions
diff --git a/ChangeLog b/ChangeLog
index ecfb2ff0..5542d69e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2016-09-08 Paul Eggert <eggert@cs.ucla.edu>
+
+ * dfa.c, dfa.h: Sync with grep.
+ * re.c (make_regexp): Adjust to DFA API changes.
+
2016-09-08 Arnold D. Robbins <arnold@skeeve.com>
* command.y: Update license text to version 3. Oops.
diff --git a/dfa.c b/dfa.c
index 5d68af23..cf9b2ba8 100644
--- a/dfa.c
+++ b/dfa.c
@@ -59,7 +59,6 @@
#define _(str) gettext (str)
#include <wchar.h>
-#include <wctype.h>
#include "xalloc.h"
@@ -363,6 +362,10 @@ struct regex_syntax
/* Flag for case-folding letters into sets. */
bool case_fold;
+ /* True if ^ and $ match only the start and end of data, and do not match
+ end-of-line within data. */
+ bool anchor;
+
/* End-of-line byte in data. */
unsigned char eolbyte;
@@ -782,7 +785,7 @@ unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
static int
char_context (struct dfa const *dfa, unsigned char c)
{
- if (c == dfa->syntax.eolbyte)
+ if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
return CTX_NEWLINE;
if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
@@ -2699,18 +2702,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
is to fail miserably. */
if (d->searchflag)
{
- /* Find the state(s) corresponding to the positions of state 0. */
- copy (&d->states[0].elems, &follows);
- separate_contexts = state_separate_contexts (&follows);
- state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &follows, CTX_NEWLINE);
- else
- state_newline = state;
- if (separate_contexts & CTX_LETTER)
- state_letter = state_index (d, &follows, CTX_LETTER);
- else
- state_letter = state;
+ state_newline = 0;
+ state_letter = d->min_trcount - 1;
+ state = d->initstate_notbol;
for (i = 0; i < NOTCHAR; ++i)
trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
@@ -3075,16 +3069,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
Both P and MBP must be no larger than END. */
static unsigned char const *
skip_remains_mb (struct dfa *d, unsigned char const *p,
- unsigned char const *mbp, char const *end, wint_t *wcp)
+ unsigned char const *mbp, char const *end)
{
- wint_t wc = WEOF;
+ wint_t wc;
if (d->syntax.never_trail[*p])
return p;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
end - (char const *) mbp, d);
- if (wcp != NULL)
- *wcp = wc;
return mbp;
}
@@ -3141,46 +3133,22 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
for (;;)
{
- if (multibyte)
+ while ((t = trans[s]) != NULL)
{
- while ((t = trans[s]) != NULL)
+ if (s < d->min_trcount)
{
- s1 = s;
-
- if (s < d->min_trcount)
+ if (!multibyte || d->states[s].mbps.nelem == 0)
{
- if (d->min_trcount == 1)
- {
- if (d->states[s].mbps.nelem == 0)
- {
- do
- {
- while (t[*p] == 0)
- p++;
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- while (t[*p] == 0);
- }
- else
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- else
- {
- wint_t wc;
- mbp = skip_remains_mb (d, p, mbp, end, &wc);
-
- /* If d->min_trcount is greater than 1, maybe
- transit to another initial state after skip. */
- if (p < mbp)
- {
- /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE
- cannot happen, as we assume that a newline
- is always a single byte character. */
- s1 = s = d->initstate_notbol;
- p = mbp;
- }
- }
+ while (t[*p] == s)
+ p++;
}
+ if (multibyte)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+
+ if (multibyte)
+ {
+ s1 = s;
if (d->states[s].mbps.nelem == 0
|| d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
@@ -3196,22 +3164,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
trans = d->trans;
}
}
- }
- else
- {
- if (s == 0)
- {
- t = trans[s];
- if (t)
- {
- while (t[*p] == 0)
- p++;
- s1 = 0;
- s = t[*p++];
- }
- }
-
- while ((t = trans[s]) != NULL)
+ else
{
s1 = t[*p++];
t = trans[s1];
@@ -3222,6 +3175,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
s1 = tmp; /* swap */
break;
}
+ if (s < d->min_trcount)
+ {
+ while (t[*p] == s1)
+ p++;
+ }
s = t[*p++];
}
}
@@ -3239,19 +3197,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
nlcount++;
mbp = p;
- s = allow_nl ? d->newlines[s1] : 0;
+ s = (allow_nl ? d->newlines[s1]
+ : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
+ : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+ : d->initstate_notbol);
}
else if (d->fails[s])
{
- if (d->success[s] & d->syntax.sbit[*p])
+ if ((d->success[s] & d->syntax.sbit[*p])
+ || ((char *) p == end
+ && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
+ *d)))
goto done;
+ if (multibyte && s < d->min_trcount)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+
s1 = s;
if (!multibyte || d->states[s].mbps.nelem == 0
- || (*p == eol && !allow_nl)
- || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
- || (char *) p >= end)
+ || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
{
/* If a input character does not match ANYCHAR, do it
like a single-byte character. */
@@ -3813,9 +3777,11 @@ dfamust (struct dfa const *d)
bool exact = false;
bool begline = false;
bool endline = false;
+ size_t rj;
bool need_begline = false;
bool need_endline = false;
bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
+ struct dfamust *dm;
for (ri = 0; ri < d->tindex; ++ri)
{
@@ -3992,7 +3958,7 @@ dfamust (struct dfa const *d)
}
}
- size_t rj = ri + 2;
+ rj = ri + 2;
if (d->tokens[ri + 1] == CAT)
{
for (; rj < d->tindex - 1; rj += 2)
@@ -4021,7 +3987,7 @@ dfamust (struct dfa const *d)
}
done:;
- struct dfamust *dm = NULL;
+ dm = NULL;
if (*result)
{
dm = xmalloc (sizeof *dm);
@@ -4057,7 +4023,7 @@ dfaalloc (void)
/* Initialize DFA. */
void
dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
- reg_syntax_t bits, bool fold, unsigned char eol)
+ reg_syntax_t bits, int dfaopts)
{
int i;
memset (dfa, 0, offsetof (struct dfa, dfaexec));
@@ -4070,9 +4036,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
dfa->canychar = -1;
dfa->lex.cur_mb_len = 1;
dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
+ dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
+ dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
diff --git a/dfa.h b/dfa.h
index 1fd37ec9..8608b108 100644
--- a/dfa.h
+++ b/dfa.h
@@ -26,7 +26,11 @@
#endif /* HAVE_STDBOOL_H */
#include <stddef.h>
-#define _GL_ATTRIBUTE_MALLOC
+#if 3 <= __GNUC__
+# define _GL_ATTRIBUTE_MALLOC __attribute__ ((__malloc__))
+#else
+# define _GL_ATTRIBUTE_MALLOC
+#endif
struct localeinfo; /* See localeinfo.h. */
@@ -50,15 +54,29 @@ struct dfa;
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
+/* DFA options that can be ORed together, for dfasyntax's 4th arg. */
+enum
+ {
+ /* ^ and $ match only the start and end of data, and do not match
+ end-of-line within data. This is always false for grep, but
+ possibly true for other apps. */
+ DFA_ANCHOR = 1 << 0,
+
+ /* Ignore case while matching. */
+ DFA_CASE_FOLD = 1 << 1,
+
+ /* '\0' in data is end-of-line, instead of the traditional '\n'. */
+ DFA_EOL_NUL = 1 << 2
+ };
+
/* Initialize or reinitialize a DFA. This must be called before
any of the routines below. The arguments are:
1. The DFA to operate on.
2. Information about the current locale.
- 3. The syntax bits described earlier in this file.
- 4. The case-folding flag.
- 5. The line terminator. */
+ 3. Syntax bits described in regex.h.
+ 4. Additional DFA options described above. */
extern void dfasyntax (struct dfa *, struct localeinfo const *,
- reg_syntax_t, bool, unsigned char);
+ reg_syntax_t, int);
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
diff --git a/helpers/ChangeLog b/helpers/ChangeLog
index 0958a021..47062233 100644
--- a/helpers/ChangeLog
+++ b/helpers/ChangeLog
@@ -1,3 +1,7 @@
+2016-09-08 Paul Eggert <eggert@cs.ucla.edu>
+
+ * testdfa.c: Adjust to DFA API changes.
+
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
* 4.1.4: Release tar ball made.
diff --git a/helpers/testdfa.c b/helpers/testdfa.c
index 4495e11a..fa7715f9 100644
--- a/helpers/testdfa.c
+++ b/helpers/testdfa.c
@@ -44,6 +44,7 @@
#define _Noreturn
#define _GL_ATTRIBUTE_PURE
#include "dfa.h"
+#include "localeinfo.h"
const char *regexflags2str(int flags);
char *databuf(int fd);
@@ -71,7 +72,8 @@ void usage(const char *myname)
int main(int argc, char **argv)
{
- int c, ret, try_backref;
+ int c, ret;
+ bool try_backref;
struct re_pattern_buffer pat;
struct re_registers regs;
struct dfa *dfareg;
@@ -84,6 +86,7 @@ int main(int argc, char **argv)
char save;
size_t count = 0;
char *place;
+ struct localeinfo localeinfo;
if (argc < 2)
usage(argv[0]);
@@ -158,7 +161,6 @@ int main(int argc, char **argv)
dfa_syn = syn;
if (ignorecase)
dfa_syn |= RE_ICASE;
- dfasyntax(dfa_syn, ignorecase, '\n');
re_set_syntax(syn);
if ((rerr = re_compile_pattern(pattern, len, & pat)) != NULL) {
@@ -171,6 +173,10 @@ int main(int argc, char **argv)
pat.newline_anchor = false; /* don't get \n in middle of string */
dfareg = dfaalloc();
+ init_localeinfo(&localeinfo);
+ dfasyntax(dfareg, &localeinfo, dfa_syn,
+ ignorecase ? DFA_CASE_FOLD : 0);
+
printf("Calling dfacomp(%s, %d, %p, true)\n",
pattern, (int) len, dfareg);
diff --git a/re.c b/re.c
index 6a100db0..69cc50e1 100644
--- a/re.c
+++ b/re.c
@@ -227,7 +227,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
rp->dfareg = dfaalloc();
- dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n');
+ dfasyntax(rp->dfareg, & localeinfo, dfa_syn,
+ ignorecase ? DFA_CASE_FOLD : 0);
dfacomp(buf, len, rp->dfareg, true);
} else
rp->dfareg = NULL;