summaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2010-01-06 15:40:49 -0800
committerKaz Kylheku <kaz@kylheku.com>2010-01-06 15:40:49 -0800
commitc017faf62d69d43219f7d1d651f7a46083f8a6a4 (patch)
tree5f1dfb8a3af86e952800962a707e3ba0a0cf1010 /regex.c
parent18fa11c511ba1b600b43b7196d72f732e4f3ccf5 (diff)
downloadtxr-c017faf62d69d43219f7d1d651f7a46083f8a6a4.tar.gz
txr-c017faf62d69d43219f7d1d651f7a46083f8a6a4.tar.bz2
txr-c017faf62d69d43219f7d1d651f7a46083f8a6a4.zip
Remove incorrect implementation of extended
regex operations (complement, intersection). The syntax extensions documentation are retained.
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c305
1 files changed, 32 insertions, 273 deletions
diff --git a/regex.c b/regex.c
index 5cbe11a0..ec1e656c 100644
--- a/regex.c
+++ b/regex.c
@@ -71,20 +71,17 @@ typedef cset_L2_t *cset_L3_t[17];
struct any_char_set {
unsigned type : 3;
unsigned comp : 1;
- unsigned refs : 12;
};
struct small_char_set {
unsigned type : 3;
unsigned comp : 1;
- unsigned refs : 12;
cset_L0_t bitcell;
};
struct displaced_char_set {
unsigned type : 3;
unsigned comp : 1;
- unsigned refs : 12;
cset_L0_t bitcell;
wchar_t base;
};
@@ -93,14 +90,12 @@ struct displaced_char_set {
struct large_char_set {
unsigned type : 3;
unsigned comp : 1;
- unsigned refs : 12;
cset_L2_t dir;
};
struct xlarge_char_set {
unsigned type : 3;
unsigned comp : 1;
- unsigned refs : 12;
cset_L3_t dir;
};
@@ -114,38 +109,15 @@ typedef union char_set {
#define NFA_SET_SIZE 512
-/*
- * NFA node types, where each node represents a state, plus up to
- * either two epsilon-transitions, or a single transition for a character, *
- * character class or wild (match any character). The types starting
- * with nfa_reject are involved in special representations needed
- * for complement NFA's; these only occur if a regex contains the
- * complement operator.
- */
-
typedef enum {
- nfa_accept, nfa_empty, nfa_wild,
- nfa_single, nfa_set, nfa_super_accept,
- nfa_reject, nfa_compl_empty, nfa_compl_wild,
- nfa_compl_single, nfa_compl_set
+ nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
} nfa_kind_t;
-/*
- * Node used for types nfa_accept, nfa_reject, nfa_super_accept.
- *
- * NOTE: nfa_super_accept is globally instantiated; there is
- * only one node of this type in the system. Transitions to it
- * are implicit. Its visit flag is never used, so there is no
- * reentrancy issue.
- */
struct nfa_state_accept {
nfa_kind_t kind;
unsigned visited;
};
-/*
- * Node used for nfa_empty, nfa_compl_empty.
- */
struct nfa_state_empty {
nfa_kind_t kind;
unsigned visited;
@@ -153,20 +125,13 @@ struct nfa_state_empty {
nfa_state_t *trans1;
};
-/*
- * Node used for nfa_single, nfa_wild,
- * nfa_compl_single, nfa_compl_wild.
- */
struct nfa_state_single {
nfa_kind_t kind;
unsigned visited;
nfa_state_t *trans;
- wchar_t ch; /* set to 0 if nfa_state_wild or nfa_compl_wild. */
+ wchar_t ch;
};
-/*
- * Node used for nfa_set, nfa_compl_set.
- */
struct nfa_state_set {
nfa_kind_t kind;
unsigned visited;
@@ -190,23 +155,6 @@ struct nfa_machine {
nfa_t nfa;
};
-static nfa_state_t nfa_super_accept_state = { { nfa_super_accept } };
-
-INLINE int nfa_is_accept_state(nfa_state_t *state)
-{
- switch (state->a.kind) {
- case nfa_accept:
- case nfa_super_accept:
- case nfa_compl_empty:
- case nfa_compl_wild:
- case nfa_compl_single:
- case nfa_compl_set:
- return 1;
- default:
- return 0;
- }
-}
-
static int L0_full(cset_L0_t *L0)
{
int i;
@@ -475,7 +423,6 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
*cs = blank;
cs->any.type = type;
- cs->any.refs = 1;
if (type == CHSET_DISPLACED)
cs->d.base = base;
@@ -485,30 +432,22 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
static void char_set_destroy(char_set_t *set)
{
- if (--set->any.refs == 0) {
- switch (set->any.type) {
- case CHSET_DISPLACED:
- case CHSET_SMALL:
- free(set);
- break;
- case CHSET_LARGE:
- L2_free(&set->l.dir);
- free(set);
- break;
- case CHSET_XLARGE:
- L3_free(&set->xl.dir);
- free(set);
- break;
- }
+ switch (set->any.type) {
+ case CHSET_DISPLACED:
+ case CHSET_SMALL:
+ free(set);
+ break;
+ case CHSET_LARGE:
+ L2_free(&set->l.dir);
+ free(set);
+ break;
+ case CHSET_XLARGE:
+ L3_free(&set->xl.dir);
+ free(set);
+ break;
}
}
-static char_set_t *char_set_clone(char_set_t *set)
-{
- set->any.refs++;
- return set;
-}
-
static void char_set_compl(char_set_t *set)
{
set->any.comp = 1;
@@ -663,7 +602,7 @@ static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
nfa_state_t *t1)
{
- assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
+ assert (acc->a.kind == nfa_accept);
acc->e.kind = nfa_empty;
acc->e.trans0 = t0;
acc->e.trans1 = t1;
@@ -685,7 +624,7 @@ static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
*/
static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
{
- assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
+ assert (acc->a.kind == nfa_accept);
*acc = *st;
}
@@ -784,136 +723,6 @@ static nfa_t nfa_compile_set(val args, int comp)
}
}
-static nfa_state_t *nfa_compl_state(nfa_state_t *in)
-{
- nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st);
-
- st->a.visited = 0;
-
- switch (in->a.kind) {
- case nfa_accept:
- st->a.kind = nfa_reject;
- break;
- case nfa_empty:
- st->a.kind = nfa_compl_empty;
- st->e.trans0 = 0;
- st->e.trans1 = 0;
- break;
- case nfa_wild:
- st->a.kind = nfa_compl_wild;
- st->o.trans = 0;
- st->o.ch = 0;
- break;
- case nfa_single:
- st->a.kind = nfa_compl_single;
- st->o.trans = 0;
- st->o.ch = in->o.ch;
- break;
- case nfa_set:
- st->a.kind = nfa_compl_set;
- st->s.trans = 0;
- st->s.set = char_set_clone(in->s.set);
- break;
- case nfa_super_accept:
- internal_error("NFA super accept state explicitly reachable in graph");
- case nfa_reject:
- st->a.kind = nfa_accept;
- break;
- case nfa_compl_empty:
- st->a.kind = nfa_empty;
- st->e.trans0 = 0;
- st->e.trans1 = 0;
- break;
- case nfa_compl_wild:
- st->a.kind = nfa_wild;
- st->o.trans = 0;
- st->o.ch = 0;
- break;
- case nfa_compl_single:
- st->a.kind = nfa_single;
- st->o.trans = 0;
- st->o.ch = in->o.ch;
- break;
- case nfa_compl_set:
- st->a.kind = nfa_set;
- st->s.trans = 0;
- st->s.set = char_set_clone(in->s.set);
- break;
- }
-
- return st;
-}
-
-static nfa_t nfa_compl(nfa_t in)
-{
- nfa_state_t **stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *stack);
- nfa_state_t **temp = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *temp);
- unsigned visited = ++in.start->a.visited;
- int i, num = 2;
-
- stack[0] = in.start;
- stack[1] = in.accept;
-
- temp[0] = nfa_compl_state(stack[0]);
- temp[1] = nfa_compl_state(stack[1]);
-
- for (i = 0; i < num; i++) {
- nfa_state_t *s = stack[i];
- nfa_state_t *t = temp[i];
-
- if (num >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
-
- switch (s->a.kind) {
- case nfa_accept:
- case nfa_reject:
- break;
- case nfa_empty:
- case nfa_compl_empty:
- {
- nfa_state_t *e0 = s->e.trans0;
- nfa_state_t *e1 = s->e.trans1;
-
- if (e0 && e0->a.visited != visited) {
- e0->a.visited = visited;
- temp[num] = t->e.trans0 = nfa_compl_state(e0);
- stack[num++] = e0;
- }
- if (e1 && e1->a.visited != visited) {
- e1->a.visited = visited;
- temp[num] = t->e.trans1 = nfa_compl_state(e1);
- stack[num++] = e1;
- }
- }
- break;
- case nfa_wild:
- case nfa_single:
- case nfa_set:
- case nfa_compl_wild:
- case nfa_compl_single:
- case nfa_compl_set:
- if (s->o.trans->a.visited != visited) {
- s->o.trans->a.visited = visited;
- temp[num] = t->o.trans = nfa_compl_state(s->o.trans);
- stack[num++] = s->o.trans;
- }
- break;
- case nfa_super_accept:
- internal_error("NFA super accept state explicitly reachable in graph");
- }
- }
-
- if (num > NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
-
- {
- nfa_t ret = { temp[0], temp[1] };
- free(temp);
- free(stack);
- return ret;
- }
-}
-
/*
* Input is the items from a regex form,
* not including the regex symbol.
@@ -976,9 +785,6 @@ nfa_t nfa_compile_regex(val items)
nfa_t nfa_args = nfa_compile_regex(args);
nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept);
nfa = nfa_make(s, nfa_args.accept);
- } else if (sym == compl_s) {
- nfa_t nfa_args = nfa_compile_regex(args);
- nfa = nfa_compl(nfa_args);
} else if (sym == or_s) {
/* Simple: make a new start and acceptance state, which form
the ends of a spindle that goes through two branches. */
@@ -992,19 +798,6 @@ nfa_t nfa_compile_regex(val items)
nfa_state_empty_convert(nfa_first.accept, acc, 0);
nfa_state_empty_convert(nfa_second.accept, acc, 0);
nfa = nfa_make(s, acc);
- } else if (sym == and_s) {
- /* We compile regex and using the De Morgan's equality:
- R1&R2 == ~(~R1|~R2).
- We could do this by transforming the source code,
- and recursing into compiler, but instead, the approach taken
- is NFA manipulation closely based on the or_s case above. */
- nfa_t nfa_first = nfa_compl(nfa_compile_regex(first(args)));
- nfa_t nfa_second = nfa_compl(nfa_compile_regex(second(args)));
- nfa_state_t *acc = nfa_state_accept();
- nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
- nfa_state_empty_convert(nfa_first.accept, acc, 0);
- nfa_state_empty_convert(nfa_second.accept, acc, 0);
- nfa = nfa_compl(nfa_make(s, acc));
} else {
assert (0 && "internal error: bad operator in regex");
}
@@ -1024,52 +817,45 @@ nfa_t nfa_compile_regex(val items)
}
}
-static int nfa_all_states(nfa_state_t **stack, int num, unsigned visited)
+static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
{
int i;
for (i = 0; i < num; i++)
- stack[i]->a.visited = visited;
+ inout[i]->a.visited = visited;
for (i = 0; i < num; i++) {
- nfa_state_t *s = stack[i];
+ nfa_state_t *s = inout[i];
if (num >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
switch (s->a.kind) {
case nfa_accept:
- case nfa_reject:
break;
case nfa_empty:
- case nfa_compl_empty:
{
nfa_state_t *e0 = s->e.trans0;
nfa_state_t *e1 = s->e.trans1;
if (e0 && e0->a.visited != visited) {
e0->a.visited = visited;
- stack[num++] = e0;
+ inout[num++] = e0;
}
if (e1 && e1->a.visited != visited) {
e1->a.visited = visited;
- stack[num++] = e1;
+ inout[num++] = e1;
}
}
break;
case nfa_wild:
case nfa_single:
case nfa_set:
- case nfa_compl_wild:
- case nfa_compl_single:
- case nfa_compl_set:
if (s->o.trans->a.visited != visited) {
s->o.trans->a.visited = visited;
- stack[num++] = s->o.trans;
+ inout[num++] = s->o.trans;
}
break;
- case nfa_super_accept:
- internal_error("NFA super accept state explicitly reachable in graph");
}
}
@@ -1087,7 +873,7 @@ void nfa_free(nfa_t nfa)
all[0] = nfa.start;
all[1] = nfa.accept;
- nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1);
+ nstates = nfa_all_states(all, 2, nfa.start->a.visited);
for (i = 0; i < nstates; i++)
nfa_state_free(all[i]);
@@ -1120,15 +906,10 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
for (i = 0; i < nin; i++) {
if (stackp >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- /* We do not mark the super accept state visited,
- and we will never visit it by following e-transitions,
- because all transitions into the super-accept state
- are character-consuming. */
- if (in[i] != &nfa_super_accept_state)
- in[i]->a.visited = visited;
+ in[i]->a.visited = visited;
stack[stackp++] = in[i];
out[nout++] = in[i];
- if (nfa_is_accept_state(in[i]))
+ if (in[i]->a.kind == nfa_accept)
*accept = 1;
}
@@ -1138,13 +919,10 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
if (nout >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- /* Only states of type nfa_empty or its complement are interesting.
- Each such state at most two epsilon transitions.
- Note above comment that epsilon transitions never lead
- to the special super accept state; i.e. e0 and e1
- are never the super accept state and so we can write to their
- their visited field. */
- if (top->a.kind == nfa_empty || top->a.kind == nfa_compl_empty) {
+ /* Only states of type nfa_empty are interesting.
+ Each such state at most two epsilon transitions. */
+
+ if (top->a.kind == nfa_empty) {
nfa_state_t *e0 = top->e.trans0;
nfa_state_t *e1 = top->e.trans1;
@@ -1152,7 +930,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e0->a.visited = visited;
stack[stackp++] = e0;
out[nout++] = e0;
- if (nfa_is_accept_state(e0))
+ if (e0->a.kind == nfa_accept)
*accept = 1;
}
@@ -1160,7 +938,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e1->a.visited = visited;
stack[stackp++] = e1;
out[nout++] = e1;
- if (nfa_is_accept_state(e1))
+ if (e1->a.kind == nfa_accept)
*accept = 1;
}
}
@@ -1180,41 +958,22 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
{
int i, nmove;
- int super_added = 0;
for (nmove = 0, i = 0; i < nin; i++) {
nfa_state_t *s = in[i];
switch (s->a.kind) {
case nfa_wild:
- case nfa_compl_wild:
/* Unconditional match; don't have to look at ch. */
break;
case nfa_single:
if (s->o.ch == ch) /* Character match. */
break;
continue; /* no match */
- case nfa_compl_single:
- if (s->o.ch == ch) /* Character match. */
- break;
- goto add_super; /* no match means we can move to super accept state */
case nfa_set:
if (char_set_contains(s->s.set, ch)) /* Set match. */
break;
continue; /* no match */
- case nfa_compl_set:
- if (char_set_contains(s->s.set, ch)) /* Set match. */
- break;
- goto add_super;
- case nfa_super_accept:
- /* super always transitions to itself, for any character. */
- add_super:
- if (!super_added) {
- super_added = 1;
- if (nmove >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
- out[nmove++] = &nfa_super_accept_state;
- }
default:
/* Epsilon-transition and acceptance states have no character moves. */
continue;