summaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2014-03-09 00:02:03 -0800
committerKaz Kylheku <kaz@kylheku.com>2014-03-09 00:02:03 -0800
commit01504b7436608a7501bb06f4f1b965607ceb1345 (patch)
tree648006cb0b462d91b5e32a92d70d5e160a89a476 /regex.c
parent7a3e541d24ea5d15ff7ee28a3505a9f33fd049f8 (diff)
downloadtxr-01504b7436608a7501bb06f4f1b965607ceb1345.tar.gz
txr-01504b7436608a7501bb06f4f1b965607ceb1345.tar.bz2
txr-01504b7436608a7501bb06f4f1b965607ceb1345.zip
Issue: match_regex and search_regex were continuing to feed characters
to the regex machine even when there is no transition available. This was due to the broken return value protocol of regex_machine_feed. For instance for the regex / +/ (one or more spaces), after matching some spaces, it would report REGM_INCOMPLETE for additional non-space characters, never reporting REGM_FAIL. * regex.c (regm_result_t): Block comment added, documenting protocol. (regex_machine_feed): Return REGM_FAIL if there are no transitions for the given character, even a partial match has been recorded. This is a signal to stop feeding more characters. At that point, the function can be called with a null character to distinguish the three cases: fail, partial or full match. (search_regex): Now when the search loop gets a REGM_FAIL, it can no longer assume that nothing was matched and the search must restart at the next position. Upon the REGM_FAIL signal, it is necesary to seal the search by feeding in the 0 character. Only if that returns REGM_FAIL is it a no match situation. Otherwise it is actually a match!
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c64
1 files changed, 44 insertions, 20 deletions
diff --git a/regex.c b/regex.c
index 2e24338d..2137ca56 100644
--- a/regex.c
+++ b/regex.c
@@ -54,8 +54,32 @@ typedef struct nfa {
nfa_state_t *accept;
} nfa_t;
+/*
+ * Result from regex_machine_feed.
+ * These values have two meanings, based on whether
+ * the matching is still open (characters are being fed)
+ * or finalized.
+ *
+ * When feeding characters:
+ * REGM_INCOMPLETE: no match at this character, but matching can continue.
+ * REGM_FAIL: no more state transitions are possible.
+ * REGM_MATCH: match (accept state) for this character.
+ *
+ * When the end of the input is encountered, or a REGM_FAIL,
+ * then regex_machine_feed is called one more time with
+ * the null character. It then reports:
+ * REGM_INCOMPLETE: there was a partial match for the input.
+ * REGM_FAIL: none of the input matched.
+ * REGM_MATCH: the input was completely matched
+ *
+ * Note that a REGM_FAIL (no transitions) during the character feeding phase
+ * can turn into REGM_INCOMPLETE (partial match) when the match is sealed with
+ * the null character signal!
+ */
typedef enum regm_result {
- REGM_INCOMPLETE, REGM_FAIL, REGM_MATCH
+ REGM_INCOMPLETE,
+ REGM_FAIL,
+ REGM_MATCH
} regm_result_t;
typedef union regex_machine regex_machine_t;
@@ -1097,7 +1121,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
int i, nout = 0;
int stackp = 0;
- /* First, add all states in the input state to the closure,
+ /* First, add all states in the input set to the closure,
push them on the stack, and mark them as visited. */
for (i = 0; i < nin; i++) {
if (stackp >= NFA_SET_SIZE)
@@ -1728,16 +1752,14 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
regm->n.nmove, regm->n.clos,
regm->n.visited++, &accept);
- if (accept)
- regm->n.last_accept_pos = regm->n.count;
- }
+ regm->n.nfa.start->a.visited = regm->n.visited;
- regm->n.nfa.start->a.visited = regm->n.visited;
+ if (accept) {
+ regm->n.last_accept_pos = regm->n.count;
+ return REGM_MATCH;
+ }
- if (ch && regm->n.nclos != 0) {
- if (accept)
- return REGM_MATCH;
- return REGM_INCOMPLETE;
+ return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
}
} else {
val accept = nil;
@@ -1745,14 +1767,13 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
if (ch != 0) {
regm->d.count++;
regm->d.deriv = reg_derivative(regm->d.deriv, chr(ch));
- if ((accept = reg_nullable(regm->d.deriv)))
+
+ if ((accept = reg_nullable(regm->d.deriv))) {
regm->d.last_accept_pos = regm->d.count;
- }
+ return REGM_MATCH;
+ }
- if (ch && regm->d.deriv != t) {
- if (accept)
- return REGM_MATCH;
- return REGM_INCOMPLETE;
+ return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
}
}
@@ -1766,7 +1787,6 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
return REGM_INCOMPLETE;
}
-
val search_regex(val haystack, val needle_regex, val start,
val from_end)
{
@@ -1798,9 +1818,13 @@ again:
last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i)));
if (last_res == REGM_FAIL) {
- regex_machine_reset(&regm);
- pos = plus(pos, one);
- goto again;
+ last_res = regex_machine_feed(&regm, 0);
+ if (last_res == REGM_FAIL) {
+ regex_machine_reset(&regm);
+ pos = plus(pos, one);
+ goto again;
+ }
+ break;
}
}