aboutsummaryrefslogtreecommitdiffstats
path: root/field.c
diff options
context:
space:
mode:
Diffstat (limited to 'field.c')
-rwxr-xr-x[-rw-r--r--]field.c569
1 files changed, 534 insertions, 35 deletions
diff --git a/field.c b/field.c
index 5bd6c632..46c9c93d 100644..100755
--- a/field.c
+++ b/field.c
@@ -39,25 +39,25 @@ is_blank(int c)
typedef void (* Setfunc) P((long, char *, long, NODE *));
static long (*parse_field) P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static void rebuild_record P((void));
static long re_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static long def_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static long posix_def_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static long null_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static long sc_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
static long fw_parse_field P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
+static long fpat_parse_field P((long, char **, int, NODE *,
+ Regexp *, Setfunc, NODE *, NODE *, int));
static void set_element P((long num, char * str, long len, NODE *arr));
static void grow_fields_arr P((long num));
static void set_field P((long num, char *str, long len, NODE *dummy));
-static void update_PROCINFO P((char *subscript, char *str));
-
static char *parse_extent; /* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
@@ -74,11 +74,11 @@ int default_FS; /* TRUE when FS == " " */
Regexp *FS_re_yes_case = NULL;
Regexp *FS_re_no_case = NULL;
Regexp *FS_regexp = NULL;
+Regexp *FPAT_re_yes_case = NULL;
+Regexp *FPAT_re_no_case = NULL;
+Regexp *FPAT_regexp = NULL;
NODE *Null_field = NULL;
-/* using_FIELDWIDTHS --- static function, macro to avoid overhead */
-#define using_FIELDWIDTHS() (parse_field == fw_parse_field)
-
/* init_fields --- set up the fields array to start with */
void
@@ -213,9 +213,7 @@ rebuild_record()
n->stptr = cops;
unref(fields_arr[i]);
fields_arr[i] = n;
-#ifdef MBS_SUPPORT
assert((n->flags & WSTRCUR) == 0);
-#endif
}
cops += fields_arr[i]->stlen + ofslen;
}
@@ -365,6 +363,7 @@ re_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *sep_arr, /* array of field separators (maybe NULL) */
int in_middle)
{
register char *scan = *buf;
@@ -372,6 +371,7 @@ re_parse_field(long up_to, /* parse only up to this field number */
register char *field;
register char *end = scan + len;
int regex_flags = RE_NEED_START;
+ char *sep;
#ifdef MBS_SUPPORT
size_t mbclen = 0;
mbstate_t mbs;
@@ -387,9 +387,17 @@ re_parse_field(long up_to, /* parse only up to this field number */
if (len == 0)
return nf;
- if (RS_is_null && default_FS)
+ if (RS_is_null && default_FS) {
+ sep = scan;
while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
scan++;
+ if (sep_arr != NULL && sep < scan)
+ set_element(nf, sep, (long)(scan - sep), sep_arr);
+ }
+
+ if (rp == NULL) /* use FS */
+ rp = FS_regexp;
+
field = scan;
while (scan < end
&& research(rp, scan, 0, (end - scan), regex_flags) != -1
@@ -397,7 +405,7 @@ re_parse_field(long up_to, /* parse only up to this field number */
regex_flags |= RE_NO_BOL;
if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */
#ifdef MBS_SUPPORT
- if (gawk_mb_cur_max > 1) {
+ if (gawk_mb_cur_max > 1) {
mbclen = mbrlen(scan, end-scan, &mbs);
if ((mbclen == 1) || (mbclen == (size_t) -1)
|| (mbclen == (size_t) -2) || (mbclen == 0)) {
@@ -417,6 +425,9 @@ re_parse_field(long up_to, /* parse only up to this field number */
}
(*set)(++nf, field,
(long)(scan + RESTART(rp, scan) - field), n);
+ if (sep_arr != NULL)
+ set_element(nf, scan + RESTART(rp, scan),
+ (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
scan += REEND(rp, scan);
field = scan;
if (scan == end) /* FS at end of record */
@@ -446,6 +457,7 @@ def_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp ATTRIBUTE_UNUSED,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *sep_arr, /* array of field separators (maybe NULL) */
int in_middle ATTRIBUTE_UNUSED)
{
register char *scan = *buf;
@@ -453,6 +465,7 @@ def_parse_field(long up_to, /* parse only up to this field number */
register char *field;
register char *end = scan + len;
char sav;
+ char *sep;
if (up_to == UNLIMITED)
nf = 0;
@@ -474,20 +487,32 @@ def_parse_field(long up_to, /* parse only up to this field number */
/* because it will be destroyed now: */
*end = ' '; /* sentinel character */
+ sep = scan;
+
for (; nf < up_to; scan++) {
/*
* special case: fs is single space, strip leading whitespace
*/
while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
scan++;
+
+ if (sep_arr != NULL && scan > sep)
+ set_element(nf, sep, (long) (scan - sep), sep_arr);
+
if (scan >= end)
break;
+
field = scan;
+
while (*scan != ' ' && *scan != '\t' && *scan != '\n')
scan++;
+
(*set)(++nf, field, (long)(scan - field), n);
+
if (scan == end)
break;
+
+ sep = scan;
}
/* everything done, restore original char at *end */
@@ -514,6 +539,7 @@ posix_def_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp ATTRIBUTE_UNUSED,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
int in_middle ATTRIBUTE_UNUSED)
{
register char *scan = *buf;
@@ -579,6 +605,7 @@ null_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp ATTRIBUTE_UNUSED,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *sep_arr, /* array of field separators (maybe NULL) */
int in_middle ATTRIBUTE_UNUSED)
{
register char *scan = *buf;
@@ -601,13 +628,18 @@ null_parse_field(long up_to, /* parse only up to this field number */
/* We treat it as a singlebyte character. */
mbclen = 1;
}
+ if (sep_arr != NULL && nf > 0)
+ set_element(nf, scan, 0L, sep_arr);
(*set)(++nf, scan, mbclen, n);
scan += mbclen;
}
} else
#endif
- for (; nf < up_to && scan < end; scan++)
+ for (; nf < up_to && scan < end; scan++) {
+ if (sep_arr != NULL && nf > 0)
+ set_element(nf, scan, 0L, sep_arr);
(*set)(++nf, scan, 1L, n);
+ }
*buf = scan;
return nf;
@@ -628,6 +660,7 @@ sc_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp ATTRIBUTE_UNUSED,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *sep_arr, /* array of field separators (maybe NULL) */
int in_middle ATTRIBUTE_UNUSED)
{
register char *scan = *buf;
@@ -678,6 +711,8 @@ sc_parse_field(long up_to, /* parse only up to this field number */
(*set)(++nf, field, (long)(scan - field), n);
if (scan == end)
break;
+ if (sep_arr != NULL)
+ set_element(nf, scan, 1L, sep_arr);
scan++;
if (scan == end) { /* FS at end of record */
(*set)(++nf, field, 0L, n);
@@ -706,6 +741,7 @@ fw_parse_field(long up_to, /* parse only up to this field number */
Regexp *rp ATTRIBUTE_UNUSED,
Setfunc set, /* routine to set the value of the parsed field */
NODE *n,
+ NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
int in_middle ATTRIBUTE_UNUSED)
{
register char *scan = *buf;
@@ -787,7 +823,9 @@ get_field(register long requested, Func_ptr *assign)
fields_arr[0]->stlen -
(parse_extent - fields_arr[0]->stptr),
save_FS, FS_regexp, set_field,
- (NODE *) NULL, in_middle);
+ (NODE *) NULL,
+ (NODE *) NULL,
+ in_middle);
parse_high_water = NF;
}
rebuild_record();
@@ -816,7 +854,7 @@ get_field(register long requested, Func_ptr *assign)
in_middle = TRUE;
parse_high_water = (*parse_field)(requested, &parse_extent,
fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
- save_FS, FS_regexp, set_field, (NODE *) NULL, in_middle);
+ save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
/*
* if we reached the end of the record, set NF to the number of
@@ -862,10 +900,10 @@ set_element(long num, char *s, long len, NODE *n)
NODE *
do_split(NODE *tree)
{
- NODE *src, *arr, *sep, *fs, *src2, *fs2, *tmp;
+ NODE *src, *arr, *sep, *fs, *sep_arr, *src2, *fs2, *tmp;
char *s;
long (*parseit) P((long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, int));
+ Regexp *, Setfunc, NODE *, NODE *, int));
Regexp *rp = NULL;
src = force_string(tree_eval(tree->lnode));
@@ -876,6 +914,30 @@ do_split(NODE *tree)
sep = tree->rnode->rnode->lnode;
+ sep_arr = tree->rnode->rnode->rnode;
+ if (sep_arr != NULL) {
+ static short warned1 = FALSE, warned2 = FALSE;
+
+ if (do_traditional || do_posix) {
+ fatal(_("split: fourth argument is a gawk extension"));
+ }
+
+ sep_arr = get_param(tree->rnode->rnode->rnode->lnode);
+
+ if (sep_arr->type != Node_var_array)
+ fatal(_("split: fourth argument is not an array"));
+
+ if (do_lint && ! warned1) {
+ warned1 = TRUE;
+ lintwarn(_("split: fourth argument is a gawk extension"));
+ }
+
+ if (do_lint_old && ! warned2) {
+ warned2 = TRUE;
+ warning(_("split: fourth argument is a gawk extension"));
+ }
+ }
+
if (src->stlen == 0) {
/*
* Skip the work if first arg is the null string.
@@ -887,13 +949,17 @@ do_split(NODE *tree)
if ((sep->re_flags & (FS_DFLT|CONSTANT)) == 0)
free_temp(tree_eval(sep->re_exp));
/*
- * And now we can safely turn off the array.
+ * And now we can safely turn off the arrays.
*/
assoc_clear(arr);
+
+ if (sep_arr != NULL)
+ assoc_clear(sep_arr);
+
return tmp_number((AWKNUM) 0);
}
- if ((sep->re_flags & FS_DFLT) != 0 && ! using_FIELDWIDTHS() && ! RS_is_null) {
+ if ((sep->re_flags & FS_DFLT) != 0 && current_field_sep() != Using_FIELDWIDTHS && ! RS_is_null) {
parseit = parse_field;
fs = force_string(FS_node->var_value);
rp = FS_regexp;
@@ -906,7 +972,7 @@ do_split(NODE *tree)
if (do_lint && ! warned) {
warned = TRUE;
- lintwarn(_("split: null string for third arg is a gawk extension"));
+ lintwarn(_("split: null string for third argument is a gawk extension"));
}
} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
if (fs->stptr[0] == ' ') {
@@ -936,14 +1002,98 @@ do_split(NODE *tree)
assoc_clear(arr);
+ if (sep_arr != NULL)
+ assoc_clear(sep_arr);
+
s = src2->stptr;
tmp = tmp_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src2->stlen,
- fs2, rp, set_element, arr, FALSE));
+ fs2, rp, set_element, arr, sep_arr, FALSE));
unref(src2);
unref(fs2);
return tmp;
}
+/*
+ * do_patsplit --- implement patsplit(), semantics are same as for field
+ * splitting with FPAT.
+ *
+ * Arguments are taken from the parse tree:
+ *	tree->lnode			the string to split
+ *	tree->rnode->lnode		the array that receives the fields
+ *	tree->rnode->rnode->lnode	the regexp node describing a field
+ *	tree->rnode->rnode->rnode	optional array for the separators
+ *
+ * Both arrays are cleared before being filled. The result is a temporary
+ * number node holding what fpat_parse_field() returned, or 0 when the
+ * source string is empty.
+ */
+
+NODE *
+do_patsplit(NODE *tree)
+{
+ NODE *src, *arr, *sep, *fpat, *sep_arr, *src2, *fpat2, *tmp;
+ char *s;
+ Regexp *rp = NULL;
+
+ src = force_string(tree_eval(tree->lnode));
+
+ arr = get_param(tree->rnode->lnode);
+ if (arr->type != Node_var_array)
+ fatal(_("patsplit: second argument is not an array"));
+
+ sep = tree->rnode->rnode->lnode;
+
+ /* the fourth argument is optional; a NULL link means it was not given */
+ sep_arr = tree->rnode->rnode->rnode;
+ if (sep_arr != NULL) {
+ sep_arr = get_param(tree->rnode->rnode->rnode->lnode);
+
+ if (sep_arr->type != Node_var_array)
+ fatal(_("patsplit: fourth argument is not an array"));
+ }
+
+ if (src->stlen == 0) {
+ /*
+ * Skip the work if first arg is the null string.
+ */
+ free_temp(src);
+ /*
+ * Evaluate sep if it may have side effects.
+ */
+ if ((sep->re_flags & CONSTANT) == 0)
+ free_temp(tree_eval(sep->re_exp));
+ /*
+ * And now we can safely turn off the arrays.
+ */
+ assoc_clear(arr);
+
+ if (sep_arr != NULL)
+ assoc_clear(sep_arr);
+
+ return tmp_number((AWKNUM) 0);
+ }
+
+ fpat = force_string(tree_eval(sep->re_exp));
+ if (fpat->stlen == 0) {
+ fatal(_("patsplit: third argument must be non-null"));
+ }
+ rp = re_update(sep);
+
+ /*
+ * do dupnode(), to avoid the same kind of problem that split()
+ * has with calls like
+ * x = split(a["LINE"], a, a["FS"])
+ * where the source and the pattern live in the array that is
+ * about to be cleared by assoc_clear. gack.
+ * this also gives us complete call by value semantics.
+ */
+ src2 = dupnode(src);
+ free_temp(src);
+
+ fpat2 = dupnode(fpat);
+ free_temp(fpat);
+
+ assoc_clear(arr);
+
+ if (sep_arr != NULL)
+ assoc_clear(sep_arr);
+
+ /* parse from the private copy; s is advanced by the parser */
+ s = src2->stptr;
+ tmp = tmp_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
+ (int) src2->stlen, fpat2, rp,
+ set_element, arr, sep_arr, FALSE));
+ unref(src2);
+ unref(fpat2);
+ return tmp;
+}
+
+
/* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
void
@@ -1017,7 +1167,7 @@ set_FIELDWIDTHS()
i--;
FIELDWIDTHS[i+1] = -1;
- update_PROCINFO("FS", "FIELDWIDTHS");
+ update_PROCINFO_str("FS", "FIELDWIDTHS");
}
/* set_FS --- handle things when FS is assigned to */
@@ -1053,7 +1203,7 @@ set_FS()
FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
/* FS = FS */
- if (! using_FIELDWIDTHS()) {
+ if (current_field_sep() == Using_FS) {
return;
} else {
remake_re = FALSE;
@@ -1148,21 +1298,26 @@ choose_fs_function:
if (fs->stlen == 1 && parse_field == re_parse_field)
FS_regexp = FS_re_yes_case;
- update_PROCINFO("FS", "FS");
+ update_PROCINFO_str("FS", "FS");
}
-/* using_fieldwidths --- is FS or FIELDWIDTHS in use? */
+/*
+ * current_field_sep --- report which field splitting mechanism is in use.
+ * The answer is derived from the parser currently installed in parse_field:
+ * Using_FIELDWIDTHS for fw_parse_field, Using_FPAT for fpat_parse_field,
+ * and Using_FS for anything else.
+ */
-int
-using_fieldwidths()
+field_sep_type
+current_field_sep()
{
- return using_FIELDWIDTHS();
+ if (parse_field == fw_parse_field)
+ return Using_FIELDWIDTHS;
+ else if (parse_field == fpat_parse_field)
+ return Using_FPAT;
+ else
+ return Using_FS;
}
-/* update_PROCINFO --- update PROCINFO[sub] when FS or FIELDWIDTHS set */
+/* update_PROCINFO_str --- update PROCINFO[sub] with string value */
-static void
-update_PROCINFO(char *subscript, char *str)
+void
+update_PROCINFO_str(char *subscript, char *str)
{
NODE **aptr;
@@ -1172,3 +1327,347 @@ update_PROCINFO(char *subscript, char *str)
aptr = assoc_lookup(PROCINFO_node, tmp_string(subscript, strlen(subscript)), FALSE);
assign_val(aptr, tmp_string(str, strlen(str)));
}
+
+/*
+ * update_PROCINFO_num --- update PROCINFO[sub] with numeric value
+ *
+ * Does nothing when PROCINFO_node is NULL. The element is looked up
+ * (and created if need be) with assoc_lookup() and the new number is
+ * stored through assign_val(), exactly as update_PROCINFO_str() does
+ * for strings. Storing through the raw pointer instead would bypass
+ * whatever assign_val() does with the element's previous value
+ * (NOTE(review): expected to release it, so repeated updates of the
+ * same subscript do not leak -- confirm against assign_val()).
+ */
+
+void
+update_PROCINFO_num(char *subscript, AWKNUM val)
+{
+	NODE **aptr;
+
+	if (PROCINFO_node == NULL)
+		return;
+
+	aptr = assoc_lookup(PROCINFO_node, tmp_string(subscript, strlen(subscript)), FALSE);
+	assign_val(aptr, tmp_number(val));
+}
+
+
+/*
+ * set_FPAT --- handle an assignment to FPAT
+ *
+ * Installs fpat_parse_field as the record parser and, when the text of
+ * FPAT actually changed, rebuilds the case-sensitive and case-insensitive
+ * regexps. If only IGNORECASE changed (or FPAT was assigned its own
+ * value) the existing regexps are kept and just the active one is
+ * re-selected. Finishes by recording "FPAT" in PROCINFO["FS"].
+ * Does nothing beyond the lint warning when do_traditional is set.
+ */
+
+void
+set_FPAT()
+{
+	static short warned = FALSE;
+	static NODE *save_fpat = NULL;	/* copy of the last FPAT value seen */
+	int remake_re = TRUE;
+	NODE *fpat;
+
+	if (do_lint && ! warned) {
+		warned = TRUE;
+		lintwarn(_("`FPAT' is a gawk extension"));
+	}
+	if (do_traditional)	/* quick and dirty, does the trick */
+		return;
+
+	/*
+	 * If changing the way fields are split, obey least-surprise
+	 * semantics, and force $0 to be split totally.
+	 */
+	if (fields_arr != NULL)
+		(void) get_field(UNLIMITED - 1, 0);
+
+	/* It's possible that only IGNORECASE changed, or FPAT = FPAT */
+	/*
+	 * This comparison can't use cmp_nodes(), which pays attention
+	 * to IGNORECASE, and that's not what we want.
+	 */
+	if (save_fpat
+		&& FPAT_node->var_value->stlen == save_fpat->stlen
+		&& memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) {
+		if (FPAT_regexp != NULL)
+			FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
+
+		/* FPAT = FPAT */
+		if (current_field_sep() == Using_FPAT) {
+			return;
+		} else {
+			/* same text, but another splitter was active: just switch back */
+			remake_re = FALSE;
+			goto set_fpat_function;
+		}
+	}
+
+	/* the text changed: remember it and throw away the stale regexps */
+	unref(save_fpat);
+	save_fpat = dupnode(FPAT_node->var_value);
+	if (FPAT_regexp != NULL) {
+		refree(FPAT_re_yes_case);
+		refree(FPAT_re_no_case);
+		FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
+	}
+
+set_fpat_function:
+	fpat = force_string(FPAT_node->var_value);
+	parse_field = fpat_parse_field;
+
+	if (remake_re) {
+		/*
+		 * No need to free anything here: this point is only reached
+		 * with remake_re set by falling through the cleanup just
+		 * above the label, which left all three pointers NULL.
+		 */
+		FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, FALSE, TRUE);
+		FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, TRUE, TRUE);
+		FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
+	}
+
+	update_PROCINFO_str("FS", "FPAT");
+}
+
+
+/*
+ * increment_scan --- macro to move scan pointer ahead by one character.
+ * Implementation varies if doing MBS or not.
+ *
+ * scanp is the address of the scan pointer and len is the number of
+ * bytes left in the buffer. The MBS version expands to a call that
+ * passes `& mbs', so a variable named mbs must be in scope wherever the
+ * macro is used. The non-MBS version ignores len and steps one byte.
+ */
+
+#ifdef MBS_SUPPORT
+#define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
+#else
+#define increment_scan(scanp, len) ((*scanp)++)
+#endif
+
+#ifdef MBS_SUPPORT
+/*
+ * incr_scan --- MBS version of increment_scan()
+ *
+ * Moves *scanp forward by one character. In a multibyte locale
+ * (gawk_mb_cur_max > 1) the width is asked of mbrlen(); a result of
+ * 0, 1, (size_t) -1 or (size_t) -2 is treated as a single byte.
+ * Otherwise the pointer simply moves one byte.
+ */
+
+static void
+incr_scan(char **scanp, size_t len, mbstate_t *mbs)
+{
+	size_t step = 1;	/* bytes to advance; one unless a wider char is found */
+
+	if (gawk_mb_cur_max > 1) {
+		size_t mbclen = mbrlen(*scanp, len, mbs);
+
+		/* We treat anything that is not a clean wide char as a singlebyte character. */
+		if (mbclen != 0
+		    && mbclen != (size_t) -1
+		    && mbclen != (size_t) -2)
+			step = mbclen;
+	}
+
+	*scanp += step;
+}
+#endif
+
+/*
+ * fpat_parse_field --- parse fields using a regexp.
+ *
+ * This is called from get_field() via (*parse_field)() and directly
+ * from do_patsplit(). This variation is for when FPAT is a regular
+ * expression -- use the value to find field contents.
+ *
+ * Text matched by rp becomes a field and is handed to (*set)() together
+ * with its field number; a null match found right after a non-null one
+ * is skipped. When sep_arr is not NULL, the text between fields is stored
+ * with set_element(), indexed by the number of the field it follows
+ * (index 0 is whatever precedes the first field). If rp is NULL,
+ * FPAT_regexp is used. Returns the field count reached; *buf is left
+ * pointing at the place where scanning stopped.
+ *
+ * This was really hard to get right. It happens to bear many resemblances
+ * to issues I had with getting gsub right with null matches. When dealing
+ * with that I prototyped in awk and had the foresight to save the awk code
+ * over in the C file. Starting with that as a base, I finally got to this
+ * awk code to do what I needed, and then translated it into C. Fortunately
+ * the C code bears a closer correspondence to the awk code here than over
+ * by gsub.
+ *
+ * BEGIN {
+ * FALSE = 0
+ * TRUE = 1
+ *
+ * fpat[1] = "([^,]*)|(\"[^\"]+\")"
+ * fpat[2] = fpat[1]
+ * fpat[3] = fpat[1]
+ * fpat[4] = "aa+"
+ * fpat[5] = fpat[4]
+ *
+ * data[1] = "Robbins,,Arnold,"
+ * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
+ * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
+ * data[4] = "bbbaaacccdddaaaaaqqqq"
+ * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
+ *
+ * for (i = 1; i in data; i++) {
+ * printf("Splitting: <%s>\n", data[i])
+ * n = mypatsplit(data[i], fields, fpat[i], seps)
+ * print "n =", n
+ * for (j = 1; j <= n; j++)
+ * printf("fields[%d] = <%s>\n", j, fields[j])
+ * for (j = 0; j in seps; j++)
+ * printf("seps[%s] = <%s>\n", j, seps[j])
+ * }
+ * }
+ *
+ * function mypatsplit(string, array, pattern, seps,
+ * eosflag, non_empty, nf) # locals
+ * {
+ * delete array
+ * delete seps
+ * if (length(string) == 0)
+ * return 0
+ *
+ * eosflag = non_empty = FALSE
+ * nf = 0
+ * while (match(string, pattern)) {
+ * if (RLENGTH > 0) { # easy case
+ * non_empty = TRUE
+ * if (! (nf in seps)) {
+ * if (RSTART == 1) # match at front of string
+ * seps[nf] = ""
+ * else
+ * seps[nf] = substr(string, 1, RSTART - 1)
+ * }
+ * array[++nf] = substr(string, RSTART, RLENGTH)
+ * string = substr(string, RSTART+RLENGTH)
+ * if (length(string) == 0)
+ * break
+ * } else if (non_empty) {
+ * # last match was non-empty, and at the
+ * # current character we get a zero length match,
+ * # which we don't want, so skip over it
+ * non_empty = FALSE
+ * seps[nf] = substr(string, 1, 1)
+ * string = substr(string, 2)
+ * } else {
+ * # 0 length match
+ * if (! (nf in seps)) {
+ * if (RSTART == 1)
+ * seps[nf] = ""
+ * else
+ * seps[nf] = substr(string, 1, RSTART - 1)
+ * }
+ * array[++nf] = ""
+ * if (! non_empty && ! eosflag) { # prev was empty
+ * seps[nf] = substr(string, 1, 1)
+ * }
+ * if (RSTART == 1) {
+ * string = substr(string, 2)
+ * } else {
+ * string = substr(string, RSTART + 1)
+ * }
+ * non_empty = FALSE
+ * }
+ * if (length(string) == 0) {
+ * if (eosflag)
+ * break
+ * else
+ * eosflag = TRUE
+ * }
+ * }
+ * if (length(string) > 0)
+ * seps[nf] = string
+ *
+ * return length(array)
+ * }
+ */
+static long
+fpat_parse_field(long up_to, /* parse only up to this field number */
+ char **buf, /* on input: string to parse; on output: point to start next */
+ int len,
+ NODE *fs ATTRIBUTE_UNUSED,
+ Regexp *rp,
+ Setfunc set, /* routine to set the value of the parsed field */
+ NODE *n,
+ NODE *sep_arr, /* array of field separators (may be NULL) */
+ int in_middle)
+{
+ char *scan = *buf;
+ long nf = parse_high_water;
+ char *start;
+ char *end = scan + len;
+ int regex_flags = RE_NEED_START;
+ int need_to_set_sep;
+ int non_empty;
+ int eosflag;
+#ifdef MBS_SUPPORT
+ mbstate_t mbs;
+
+ if (gawk_mb_cur_max > 1)
+ memset(&mbs, 0, sizeof(mbstate_t));
+#endif
+
+ if (in_middle)
+ regex_flags |= RE_NO_BOL;
+
+ /* a full split starts counting from scratch instead of from parse_high_water */
+ if (up_to == UNLIMITED)
+ nf = 0;
+
+ if (len == 0)
+ return nf;
+
+ if (rp == NULL) /* use FPAT */
+ rp = FPAT_regexp;
+
+ /*
+ * non_empty: the previous match had nonzero length.
+ * eosflag: scan already reached end once (see bottom of loop).
+ * need_to_set_sep: the separator in front of the next field has
+ * not been stored in sep_arr yet.
+ * start: where scan stood at the top of the current iteration.
+ */
+ eosflag = non_empty = FALSE;
+ need_to_set_sep = TRUE;
+ start = scan;
+ while (research(rp, scan, 0, (end - scan), regex_flags) != -1
+ && nf < up_to) {
+ regex_flags |= RE_NO_BOL;
+
+ if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
+ non_empty = TRUE;
+ if (sep_arr != NULL && need_to_set_sep) {
+ if (RESTART(rp, scan) == 0) /* match at front */
+ set_element(nf, start, 0L, sep_arr);
+ else
+ set_element(nf,
+ start,
+ (long) RESTART(rp, scan),
+ sep_arr);
+ }
+ /* field is text that matched */
+ (*set)(++nf,
+ scan + RESTART(rp, scan),
+ (long)(REEND(rp, scan) - RESTART(rp, scan)),
+ n);
+
+ scan += REEND(rp, scan);
+ if (scan >= end)
+ break;
+ need_to_set_sep = TRUE;
+ } else if (non_empty) { /* else if non_empty */
+ /*
+ * last match was non-empty, and at the
+ * current character we get a zero length match,
+ * which we don't want, so skip over it
+ */
+ non_empty = FALSE;
+ if (sep_arr != NULL) {
+ need_to_set_sep = FALSE;
+ /*
+ * NOTE(review): the separator is recorded as one byte,
+ * while increment_scan() may step over a whole multibyte
+ * character under MBS_SUPPORT -- confirm this is intended.
+ */
+ set_element(nf, start, 1L, sep_arr);
+ }
+ increment_scan(& scan, end - scan);
+ } else {
+ /* 0 length match */
+ if (sep_arr != NULL && need_to_set_sep) {
+ if (RESTART(rp, scan) == 0) /* RSTART == 1 */
+ set_element(nf, start, 0L, sep_arr);
+ else
+ set_element(nf, start,
+ (long) RESTART(rp, scan),
+ sep_arr);
+ }
+ need_to_set_sep = TRUE;
+ /* the field itself is empty */
+ (*set)(++nf, scan, 0L, n);
+ if (! non_empty && ! eosflag) { /* prev was empty */
+ if (sep_arr != NULL) {
+ set_element(nf, start, 1L, sep_arr);
+ need_to_set_sep = FALSE;
+ }
+ }
+ if (RESTART(rp, scan) == 0)
+ increment_scan(& scan, end - scan);
+ else {
+ scan += RESTART(rp, scan);
+ }
+ non_empty = FALSE;
+ }
+ /*
+ * Reaching the end is tolerated once, so that one more match
+ * attempt is made there; the second time the loop stops.
+ */
+ if (scan >= end) { /* length(string) == 0 */
+ if (eosflag)
+ break;
+ else
+ eosflag = TRUE;
+ }
+
+ start = scan;
+ }
+ /* anything left unscanned is stored as the separator after the last field */
+ if (scan < end) {
+ if (sep_arr != NULL)
+ set_element(nf, scan, (long) (end - scan), sep_arr);
+ }
+
+ *buf = scan;
+ return nf;
+}