aboutsummaryrefslogtreecommitdiffstats
path: root/field.c
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2017-04-12 12:37:00 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2017-04-12 12:37:00 +0300
commit c8d82e842e7ad94cae039e6c978b4bea1a31c4d5 (patch)
tree b5da46c4c3e0f96683a42c6db5a32ea96aa854e3 /field.c
parent 8f83ab76a1d8861d9a992290f2691443d5169c89 (diff)
downloadegawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.tar.gz
egawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.tar.bz2
egawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.zip
Fix FPAT problems.
Diffstat (limited to 'field.c')
-rw-r--r--field.c249
1 files changed, 85 insertions, 164 deletions
diff --git a/field.c b/field.c
index a3be9773..8145141c 100644
--- a/field.c
+++ b/field.c
@@ -1502,101 +1502,65 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
* via (*parse_field)(). This variation is for when FPAT is a regular
* expression -- use the value to find field contents.
*
- * This was really hard to get right. It happens to bear many resemblances
- * to issues I had with getting gsub right with null matches. When dealing
- * with that I prototyped in awk and had the foresight to save the awk code
- * over in the C file. Starting with that as a base, I finally got to this
- * awk code to do what I needed, and then translated it into C. Fortunately
- * the C code bears a closer correspondance to the awk code here than over
- * by gsub.
+ * The FPAT parsing logic is a bit difficult to specify, in particular
+ * when it comes to allowing null fields at certain locations. To make the code as robust
+ * as possible, an awk reference implementation was written and tested
+ * as a first step, and later recoded in C, preserving its structure as
+ * much as possible.
*
- * BEGIN {
- * false = 0
- * true = 1
- *
- * fpat[1] = "([^,]*)|(\"[^\"]+\")"
- * fpat[2] = fpat[1]
- * fpat[3] = fpat[1]
- * fpat[4] = "aa+"
- * fpat[5] = fpat[4]
- *
- * data[1] = "Robbins,,Arnold,"
- * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * data[4] = "bbbaaacccdddaaaaaqqqq"
- * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
- *
- * for (i = 1; i in data; i++) {
- * printf("Splitting: <%s>\n", data[i])
- * n = mypatsplit(data[i], fields, fpat[i], seps)
- * print "n =", n
- * for (j = 1; j <= n; j++)
- * printf("fields[%d] = <%s>\n", j, fields[j])
- * for (j = 0; j in seps; j++)
- * printf("seps[%s] = <%s>\n", j, seps[j])
- * }
- * }
- *
- * function mypatsplit(string, array, pattern, seps,
- * eosflag, non_empty, nf) # locals
+ * # Reference implementation of the FPAT record parsing.
+ * #
+ * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
+ * # Each loop iteration must consume some characters, except for the first field.
+ * # So a null field is only valid as a first field or after a non-null separator.
+ * # A null record has no fields (not a single null field).
+ *
+ * function refpatsplit(string, fields, pattern, seps,
+ * parse_start, sep_start, field_start, field_length, field_found, nf) # locals
* {
- * delete array
- * delete seps
- * if (length(string) == 0)
- * return 0
- *
- * eosflag = non_empty = false
- * nf = 0
- * while (match(string, pattern)) {
- * if (RLENGTH > 0) { # easy case
- * non_empty = true
- * if (! (nf in seps)) {
- * if (RSTART == 1) # match at front of string
- * seps[nf] = ""
- * else
- * seps[nf] = substr(string, 1, RSTART - 1)
- * }
- * array[++nf] = substr(string, RSTART, RLENGTH)
- * string = substr(string, RSTART+RLENGTH)
- * if (length(string) == 0)
- * break
- * } else if (non_empty) {
- * # last match was non-empty, and at the
- * # current character we get a zero length match,
- * # which we don't want, so skip over it
- * non_empty = false
- * seps[nf] = substr(string, 1, 1)
- * string = substr(string, 2)
- * } else {
- * # 0 length match
- * if (! (nf in seps)) {
- * if (RSTART == 1)
- * seps[nf] = ""
- * else
- * seps[nf] = substr(string, 1, RSTART - 1)
- * }
- * array[++nf] = ""
- * if (! non_empty && ! eosflag) { # prev was empty
- * seps[nf] = substr(string, 1, 1)
- * }
- * if (RSTART == 1) {
- * string = substr(string, 2)
- * } else {
- * string = substr(string, RSTART + 1)
- * }
- * non_empty = false
- * }
- * if (length(string) == 0) {
- * if (eosflag)
- * break
- * else
- * eosflag = true
- * }
- * }
- * if (length(string) > 0)
- * seps[nf] = string
- *
- * return length(array)
+ * # Local state variables:
+ * # - parse_start: pointer to the first not yet consumed character
+ * # - sep_start: pointer to the beginning of the parsed separator
+ * # - field_start: pointer to the beginning of the parsed field
+ * # - field_length: length of the parsed field
+ * # - field_found: flag for successful field match
+ * # - nf: Number of fields found so far
+ *
+ * # Prepare for parsing
+ * parse_start = 1 # first not yet parsed char
+ * nf = 0 # fields found so far
+ * delete fields
+ * delete seps
+ *
+ * # Loop that consumes the whole record
+ * while (parse_start <= length(string)) { # still something to parse
+ *
+ * # first attempt to match the next field
+ * sep_start = parse_start
+ * field_found = match(substr(string, parse_start), pattern)
+ *
+ * # check for an invalid null field and retry one character away
+ * if (nf > 0 && field_found && RSTART==1 && RLENGTH==0) {
+ * parse_start++
+ * field_found = match(substr(string, parse_start), pattern)
+ * }
+ *
+ * # store the (sep[n-1],field[n]) pair
+ * if (field_found) {
+ * field_start = parse_start + RSTART - 1
+ * field_length = RLENGTH
+ * seps[nf] = substr(string, sep_start, field_start-sep_start)
+ * fields[++nf] = substr(string, field_start, field_length)
+ * parse_start = field_start + field_length
+ *
+ * # store the final extra sep after the last field
+ * } else {
+ * seps[nf] = substr(string, sep_start)
+ * parse_start = length(string) + 1
+ * }
+ * }
+ *
+ * return nf
* }
*/
static long
@@ -1615,10 +1579,9 @@ fpat_parse_field(long up_to, /* parse only up to this field number */
char *start;
char *end = scan + len;
int regex_flags = RE_NEED_START;
- bool need_to_set_sep;
- bool non_empty;
- bool eosflag;
mbstate_t mbs;
+ char* field_start;
+ bool field_found;
memset(&mbs, 0, sizeof(mbstate_t));
@@ -1631,90 +1594,48 @@ fpat_parse_field(long up_to, /* parse only up to this field number */
if (rp == NULL) /* use FPAT */
rp = FPAT_regexp;
- if (in_middle) {
- regex_flags |= RE_NO_BOL;
- }
- non_empty = rp->non_empty;
+ while (scan <= end && nf < up_to) { /* still something to parse */
- eosflag = false;
- need_to_set_sep = true;
- start = scan;
- while (research(rp, scan, 0, (end - scan), regex_flags) != -1
- && nf < up_to) {
+ /* first attempt to match the next field */
+ start = scan;
+ field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
+
+ /* check for an invalid null field and retry one character away */
+ if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
+ increment_scan(& scan, end - scan);
+ field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
+ }
- if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
- non_empty = true;
- if (sep_arr != NULL && need_to_set_sep) {
- if (RESTART(rp, scan) == 0) /* match at front */
- set_element(nf, start, 0L, sep_arr);
+ /* store the (sep[n-1],field[n]) pair */
+ if (field_found) {
+ field_start = scan + RESTART(rp, scan);
+ if (sep_arr != NULL) { /* store the separator */
+ if (field_start == start) /* match at front */
+ set_element(nf, start, 0L, sep_arr);
else
- set_element(nf,
+ set_element(nf,
start,
- (long) RESTART(rp, scan),
+ (long) (field_start - start),
sep_arr);
}
/* field is text that matched */
(*set)(++nf,
- scan + RESTART(rp, scan),
+ field_start,
(long)(REEND(rp, scan) - RESTART(rp, scan)),
n);
-
scan += REEND(rp, scan);
- if (scan >= end)
- break;
- need_to_set_sep = true;
- } else if (non_empty) { /* else if non_empty */
+
+ } else {
/*
- * last match was non-empty, and at the
- * current character we get a zero length match,
- * which we don't want, so skip over it
+ * No match, store the final extra separator after
+ * the last field.
*/
- non_empty = false;
- if (sep_arr != NULL) {
- need_to_set_sep = false;
- set_element(nf, start, 1L, sep_arr);
- }
- increment_scan(& scan, end - scan);
- } else {
- /* 0 length match */
- if (sep_arr != NULL && need_to_set_sep) {
- if (RESTART(rp, scan) == 0) /* RSTART == 1 */
- set_element(nf, start, 0L, sep_arr);
- else
- set_element(nf, start,
- (long) RESTART(rp, scan),
- sep_arr);
- }
- need_to_set_sep = true;
- (*set)(++nf, scan, 0L, n);
- if (! non_empty && ! eosflag) { /* prev was empty */
- if (sep_arr != NULL) {
- set_element(nf, start, 1L, sep_arr);
- need_to_set_sep = false;
- }
- }
- if (RESTART(rp, scan) == 0)
- increment_scan(& scan, end - scan);
- else {
- scan += RESTART(rp, scan);
- }
- non_empty = false;
- }
- if (scan >= end) { /* length(string) == 0 */
- if (eosflag)
- break;
- else
- eosflag = true;
+ if (sep_arr != NULL)
+ set_element(nf, start, (long) (end - start), sep_arr);
+ scan = end + 1;
}
-
- start = scan;
- }
- if (scan < end) {
- if (sep_arr != NULL)
- set_element(nf, scan, (long) (end - scan), sep_arr);
}
*buf = scan;
- rp->non_empty = non_empty;
return nf;
}