about summary refs log tree commit diff stats
path: root/field.c
diff options
context:
space:
mode:
Diffstat (limited to 'field.c')
-rw-r--r-- field.c 709
1 files changed, 343 insertions, 366 deletions
diff --git a/field.c b/field.c
index 5f5b2b65..d8c97413 100644
--- a/field.c
+++ b/field.c
@@ -2,22 +2,22 @@
* field.c - routines for dealing with fields and record parsing
*/
-/*
+/*
* Copyright (C) 1986, 1988, 1989, 1991-2016 the Free Software Foundation, Inc.
- *
+ *
* This file is part of GAWK, the GNU implementation of the
* AWK Programming Language.
- *
+ *
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
@@ -38,25 +38,34 @@ is_blank(int c)
typedef void (* Setfunc)(long, char *, long, NODE *);
-static long (*parse_field)(long, char **, int, NODE *,
+/* is the API currently overriding the default parsing mechanism? */
+static bool api_parser_override = false;
+typedef long (*parse_field_func_t)(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
+static parse_field_func_t parse_field;
+/*
+ * N.B. The normal_parse_field function pointer contains the parse_field value
+ * that should be used except when API field parsing is overriding the default
+ * field parsing mechanism.
+ */
+static parse_field_func_t normal_parse_field;
static long re_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static long def_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
-static long posix_def_parse_field(long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, NODE *, bool);
static long null_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static long sc_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
+static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
+static void purge_record(void);
static char *parse_extent; /* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
@@ -65,7 +74,7 @@ static bool resave_fs;
static NODE *save_FS; /* save current value of FS when line is read,
* to be used in deferred parsing
*/
-static int *FIELDWIDTHS = NULL;
+static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;
NODE **fields_arr; /* array of pointers to the field nodes */
bool field0_valid; /* $(>0) has not been changed yet */
@@ -95,7 +104,7 @@ init_fields()
getnode(Null_field);
*Null_field = *Nnull_string;
Null_field->valref = 1;
- Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
+ Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
field0_valid = true;
}
@@ -133,7 +142,7 @@ set_field(long num,
n = fields_arr[num];
n->stptr = str;
n->stlen = len;
- n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
+ n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */
}
/* rebuild_record --- Someone assigned a value to $(something).
@@ -163,7 +172,7 @@ rebuild_record()
tlen += (NF - 1) * OFSlen;
if ((long) tlen < 0)
tlen = 0;
- emalloc(ops, char *, tlen + 2, "rebuild_record");
+ emalloc(ops, char *, tlen + 1, "rebuild_record");
cops = ops;
ops[0] = '\0';
for (i = 1; i <= NF; i++) {
@@ -196,29 +205,32 @@ rebuild_record()
*/
for (cops = ops, i = 1; i <= NF; i++) {
NODE *r = fields_arr[i];
- if (r->stlen > 0) {
+ /*
+ * There is no reason to copy malloc'ed fields to point into
+ * the new $0 buffer, although that's how previous versions did
+ * it. It seems faster to leave the malloc'ed fields in place.
+ */
+ if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
NODE *n;
getnode(n);
- if ((r->flags & FIELD) == 0) {
- *n = *Null_field;
- n->stlen = r->stlen;
- if ((r->flags & (NUMCUR|NUMBER)) != 0) {
- n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
-#ifdef HAVE_MPFR
- if (is_mpg_float(r)) {
- mpfr_init(n->mpg_numbr);
- mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
- } else if (is_mpg_integer(r)) {
- mpz_init(n->mpg_i);
- mpz_set(n->mpg_i, r->mpg_i);
- } else
-#endif
- n->numbr = r->numbr;
- }
- } else {
- *n = *r;
- n->flags &= ~(MALLOC|STRING);
+ *n = *r;
+ if (r->valref > 1) {
+ /*
+ * This probably never happens, since it
+ * was not considered by previous versions of
+ * this function. But it seems clear that
+ * we can't leave r's stptr pointing into the
+ * old $0 buffer that we are about to unref.
+ * It's not a priori obvious that valref must be
+ * 1 in all cases, so it seems wise to support
+ * this corner case. The only question is
+ * whether to add a warning message.
+ */
+ emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
+ memcpy(r->stptr, cops, r->stlen);
+ r->stptr[r->stlen] = '\0';
+ r->flags |= MALLOC;
}
n->stptr = cops;
@@ -229,6 +241,10 @@ rebuild_record()
cops += fields_arr[i]->stlen + OFSlen;
}
+ assert((fields_arr[0]->flags & MALLOC) == 0
+ ? fields_arr[0]->valref == 1
+ : true);
+
unref(fields_arr[0]);
fields_arr[0] = tmp;
@@ -246,7 +262,7 @@ rebuild_record()
* but better correct than fast.
*/
void
-set_record(const char *buf, int cnt)
+set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
{
NODE *n;
static char *databuf;
@@ -254,14 +270,12 @@ set_record(const char *buf, int cnt)
#define INITIAL_SIZE 512
#define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
- reset_record();
+ purge_record();
/* buffer management: */
if (databuf_size == 0) { /* first time */
- emalloc(databuf, char *, INITIAL_SIZE, "set_record");
+ ezalloc(databuf, char *, INITIAL_SIZE, "set_record");
databuf_size = INITIAL_SIZE;
- memset(databuf, '\0', INITIAL_SIZE);
-
}
/*
* Make sure there's enough room. Since we sometimes need
@@ -269,8 +283,11 @@ set_record(const char *buf, int cnt)
* databuf_size is > cnt after allocation.
*/
if (cnt >= databuf_size) {
- while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
+ do {
+ if (databuf_size > MAX_SIZE/2)
+ fatal(_("input record too large"));
databuf_size *= 2;
+ } while (cnt >= databuf_size);
erealloc(databuf, char *, databuf_size, "set_record");
memset(databuf, '\0', databuf_size);
}
@@ -278,21 +295,38 @@ set_record(const char *buf, int cnt)
memcpy(databuf, buf, cnt);
/*
- * Add terminating '\0' so that C library routines
+ * Add terminating '\0' so that C library routines
* will know when to stop.
*/
databuf[cnt] = '\0';
/* manage field 0: */
+ assert((fields_arr[0]->flags & MALLOC) == 0
+ ? fields_arr[0]->valref == 1
+ : true);
+
unref(fields_arr[0]);
getnode(n);
n->stptr = databuf;
n->stlen = cnt;
n->valref = 1;
n->type = Node_val;
- n->stfmt = -1;
- n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
+ n->stfmt = STFMT_UNUSED;
+ n->flags = (STRING|STRCUR|USER_INPUT); /* do not set MALLOC */
fields_arr[0] = n;
+ if (fw != api_fw) {
+ if ((api_fw = fw) != NULL) {
+ if (! api_parser_override) {
+ api_parser_override = true;
+ parse_field = fw_parse_field;
+ update_PROCINFO_str("FS", "API");
+ }
+ } else if (api_parser_override) {
+ api_parser_override = false;
+ parse_field = normal_parse_field;
+ update_PROCINFO_str("FS", current_field_sep_str());
+ }
+ }
#undef INITIAL_SIZE
#undef MAX_SIZE
@@ -303,13 +337,21 @@ set_record(const char *buf, int cnt)
void
reset_record()
{
+ fields_arr[0] = force_string(fields_arr[0]);
+ purge_record();
+}
+
+static void
+purge_record()
+{
int i;
NODE *n;
- fields_arr[0] = force_string(fields_arr[0]);
-
NF = -1;
for (i = 1; i <= parse_high_water; i++) {
+ assert((fields_arr[i]->flags & MALLOC) == 0
+ ? fields_arr[i]->valref == 1
+ : true);
unref(fields_arr[i]);
getnode(n);
*n = *Null_field;
@@ -341,7 +383,7 @@ set_NF()
assert(NF != -1);
(void) force_number(NF_node->var_value);
- nf = get_number_si(NF_node->var_value);
+ nf = get_number_si(NF_node->var_value);
if (nf < 0)
fatal(_("NF set to negative value"));
NF = nf;
@@ -409,7 +451,7 @@ re_parse_field(long up_to, /* parse only up to this field number */
sep = scan;
while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
scan++;
- if (sep_arr != NULL && sep < scan)
+ if (sep_arr != NULL && sep < scan)
set_element(nf, sep, (long)(scan - sep), sep_arr);
}
@@ -441,8 +483,8 @@ re_parse_field(long up_to, /* parse only up to this field number */
}
(*set)(++nf, field,
(long)(scan + RESTART(rp, scan) - field), n);
- if (sep_arr != NULL)
- set_element(nf, scan + RESTART(rp, scan),
+ if (sep_arr != NULL)
+ set_element(nf, scan + RESTART(rp, scan),
(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
scan += REEND(rp, scan);
field = scan;
@@ -506,7 +548,7 @@ def_parse_field(long up_to, /* parse only up to this field number */
sep = scan;
for (; nf < up_to; scan++) {
/*
- * special case: fs is single space, strip leading whitespace
+ * special case: fs is single space, strip leading whitespace
*/
while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
scan++;
@@ -538,75 +580,6 @@ def_parse_field(long up_to, /* parse only up to this field number */
}
/*
- * posix_def_parse_field --- default field parsing.
- *
- * This is called both from get_field() and from do_split()
- * via (*parse_field)(). This variation is for when FS is a single space
- * character. The only difference between this and def_parse_field()
- * is that this one does not allow newlines to separate fields.
- */
-
-static long
-posix_def_parse_field(long up_to, /* parse only up to this field number */
- char **buf, /* on input: string to parse; on output: point to start next */
- int len,
- NODE *fs,
- Regexp *rp ATTRIBUTE_UNUSED,
- Setfunc set, /* routine to set the value of the parsed field */
- NODE *n,
- NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
- bool in_middle ATTRIBUTE_UNUSED)
-{
- char *scan = *buf;
- long nf = parse_high_water;
- char *field;
- char *end = scan + len;
- char sav;
-
- if (up_to == UNLIMITED)
- nf = 0;
- if (len == 0)
- return nf;
-
- /*
- * Nasty special case. If FS set to "", return whole record
- * as first field. This is not worth a separate function.
- */
- if (fs->stlen == 0) {
- (*set)(++nf, *buf, len, n);
- *buf += len;
- return nf;
- }
-
- /* before doing anything save the char at *end */
- sav = *end;
- /* because it will be destroyed now: */
-
- *end = ' '; /* sentinel character */
- for (; nf < up_to; scan++) {
- /*
- * special case: fs is single space, strip leading whitespace
- */
- while (scan < end && (*scan == ' ' || *scan == '\t'))
- scan++;
- if (scan >= end)
- break;
- field = scan;
- while (*scan != ' ' && *scan != '\t')
- scan++;
- (*set)(++nf, field, (long)(scan - field), n);
- if (scan == end)
- break;
- }
-
- /* everything done, restore original char at *end */
- *end = sav;
-
- *buf = scan;
- return nf;
-}
-
-/*
* null_parse_field --- each character is a separate field
*
* This is called both from get_field() and from do_split()
@@ -739,6 +712,31 @@ sc_parse_field(long up_to, /* parse only up to this field number */
}
/*
+ * calc_mbslen --- calculate the length in bytes of a multi-byte string
+ * containing len characters.
+ */
+
+static size_t
+calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
+{
+
+ size_t mbclen;
+ char *mbscan = scan;
+
+ while (len-- > 0 && mbscan < end) {
+ mbclen = mbrlen(mbscan, end - mbscan, mbs);
+ if (!(mbclen > 0 && mbclen <= (size_t)(end - mbscan)))
+ /*
+ * We treat it as a singlebyte character. This should
+ * catch error codes 0, (size_t) -1, and (size_t) -2.
+ */
+ mbclen = 1;
+ mbscan += mbclen;
+ }
+ return mbscan - scan;
+}
+
+/*
* fw_parse_field --- field parsing using FIELDWIDTHS spec
*
* This is called from get_field() via (*parse_field)().
@@ -758,53 +756,53 @@ fw_parse_field(long up_to, /* parse only up to this field number */
char *scan = *buf;
long nf = parse_high_water;
char *end = scan + len;
- int nmbc;
- size_t mbclen;
- size_t mbslen;
- size_t lenrest;
- char *mbscan;
+ const awk_fieldwidth_info_t *fw;
mbstate_t mbs;
+ size_t skiplen;
+ size_t flen;
- memset(&mbs, 0, sizeof(mbstate_t));
+ fw = (api_parser_override ? api_fw : FIELDWIDTHS);
if (up_to == UNLIMITED)
nf = 0;
if (len == 0)
return nf;
- for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
- if (gawk_mb_cur_max > 1) {
- nmbc = 0;
- mbslen = 0;
- mbscan = scan;
- lenrest = end - scan;
- while (nmbc < len && mbslen < lenrest) {
- mbclen = mbrlen(mbscan, end - mbscan, &mbs);
- if ( mbclen == 1
- || mbclen == (size_t) -1
- || mbclen == (size_t) -2
- || mbclen == 0) {
- /* We treat it as a singlebyte character. */
- mbclen = 1;
- }
- if (mbclen <= end - mbscan) {
- mbscan += mbclen;
- mbslen += mbclen;
- ++nmbc;
- }
- }
- (*set)(++nf, scan, (long) mbslen, n);
- scan += mbslen;
- } else {
- if (len > end - scan)
- len = end - scan;
- (*set)(++nf, scan, (long) len, n);
- scan += len;
+ if (gawk_mb_cur_max > 1 && fw->use_chars) {
+ /*
+ * Reset the shift state. Arguably, the shift state should
+ * be part of the file state and carried forward at all times,
+ * but nobody has complained so far, so this may not matter
+ * in practice.
+ */
+ memset(&mbs, 0, sizeof(mbstate_t));
+ while (nf < up_to && scan < end) {
+ if (nf >= fw->nf) {
+ *buf = end;
+ return nf;
+ }
+ scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
+ flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
+ (*set)(++nf, scan, (long) flen, n);
+ scan += flen;
+ }
+ } else {
+ while (nf < up_to && scan < end) {
+ if (nf >= fw->nf) {
+ *buf = end;
+ return nf;
+ }
+ skiplen = fw->fields[nf].skip;
+ if (skiplen > end - scan)
+ skiplen = end - scan;
+ scan += skiplen;
+ flen = fw->fields[nf].len;
+ if (flen > end - scan)
+ flen = end - scan;
+ (*set)(++nf, scan, (long) flen, n);
+ scan += flen;
}
}
- if (len == -1)
- *buf = end;
- else
- *buf = scan;
+ *buf = scan;
return nf;
}
@@ -857,11 +855,11 @@ get_field(long requested, Func_ptr *assign)
/*
* Keep things uniform. Also, mere intention of assigning something
* to $n should not make $0 invalid. Makes sense to invalidate $0
- * after the actual assignment is performed. Not a real issue in
+ * after the actual assignment is performed. Not a real issue in
* the interpreter otherwise, but causes problem in the
* debugger when watching or printing fields.
*/
-
+
if (assign != NULL)
*assign = invalidate_field0; /* $0 needs reconstruction */
#endif
@@ -893,7 +891,7 @@ get_field(long requested, Func_ptr *assign)
if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
NF = parse_high_water;
else if (parse_field == fpat_parse_field) {
- /* FPAT parsing is wierd, isolate the special cases */
+ /* FPAT parsing is weird, isolate the special cases */
char *rec_start = fields_arr[0]->stptr;
char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
@@ -930,7 +928,7 @@ set_element(long num, char *s, long len, NODE *n)
NODE *sub;
it = make_string(s, len);
- it->flags |= MAYBE_NUM;
+ it->flags |= USER_INPUT;
sub = make_number((AWKNUM) (num));
lhs = assoc_lookup(n, sub);
unref(*lhs);
@@ -977,12 +975,12 @@ do_split(int nargs)
if (sep_arr != NULL) {
if (sep_arr == arr)
- fatal(_("split: cannot use the same array for second and fourth args"));
+ fatal(_("split: cannot use the same array for second and fourth args"));
/* This checks need to be done before clearing any of the arrays */
for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
if (tmp == arr)
- fatal(_("split: cannot use a subarray of second arg for fourth arg"));
+ fatal(_("split: cannot use a subarray of second arg for fourth arg"));
for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
if (tmp == sep_arr)
fatal(_("split: cannot use a subarray of fourth arg for second arg"));
@@ -1000,6 +998,9 @@ do_split(int nargs)
return make_number((AWKNUM) 0);
}
+ if ((sep->flags & REGEX) != 0)
+ sep = sep->typed_re;
+
if ( (sep->re_flags & FS_DFLT) != 0
&& current_field_sep() == Using_FS
&& ! RS_is_null) {
@@ -1020,10 +1021,7 @@ do_split(int nargs)
}
} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
if (fs->stptr[0] == ' ') {
- if (do_posix)
- parseit = posix_def_parse_field;
- else
- parseit = def_parse_field;
+ parseit = def_parse_field;
} else
parseit = sc_parse_field;
} else {
@@ -1065,13 +1063,16 @@ do_patsplit(int nargs)
src = TOP_STRING();
+ if ((sep->flags & REGEX) != 0)
+ sep = sep->typed_re;
+
fpat = sep->re_exp;
if (fpat->stlen == 0)
fatal(_("patsplit: third argument must be non-null"));
if (sep_arr != NULL) {
if (sep_arr == arr)
- fatal(_("patsplit: cannot use the same array for second and fourth args"));
+ fatal(_("patsplit: cannot use the same array for second and fourth args"));
/* These checks need to be done before clearing any of the arrays */
for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
@@ -1102,6 +1103,18 @@ do_patsplit(int nargs)
return tmp;
}
+/* set_parser --- update the current (non-API) parser */
+
+static void
+set_parser(parse_field_func_t func)
+{
+ normal_parse_field = func;
+ if (! api_parser_override && parse_field != func) {
+ parse_field = func;
+ update_PROCINFO_str("FS", current_field_sep_str());
+ }
+}
+
/* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
void
@@ -1123,27 +1136,27 @@ set_FIELDWIDTHS()
return;
/*
- * If changing the way fields are split, obey least-suprise
+ * If changing the way fields are split, obey least-surprise
* semantics, and force $0 to be split totally.
*/
if (fields_arr != NULL)
(void) get_field(UNLIMITED - 1, 0);
- parse_field = fw_parse_field;
+ set_parser(fw_parse_field);
tmp = force_string(FIELDWIDTHS_node->var_value);
scan = tmp->stptr;
- if (FIELDWIDTHS == NULL)
- emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
- FIELDWIDTHS[0] = 0;
- for (i = 1; ; i++) {
+ if (FIELDWIDTHS == NULL) {
+ emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
+ FIELDWIDTHS->use_chars = awk_true;
+ }
+ FIELDWIDTHS->nf = 0;
+ for (i = 0; ; i++) {
unsigned long int tmp;
- if (i + 2 >= fw_alloc) {
+ if (i >= fw_alloc) {
fw_alloc *= 2;
- erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
+ erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
}
- /* Initialize value to be end of list */
- FIELDWIDTHS[i] = -1;
/* Ensure that there is no leading `-' sign. Otherwise,
strtoul would accept it and return a bogus result. */
while (is_blank(*scan)) {
@@ -1156,19 +1169,47 @@ set_FIELDWIDTHS()
if (*scan == '\0')
break;
- /* Detect an invalid base-10 integer, a valid value that
- is followed by something other than a blank or '\0',
- or a value that is not in the range [1..INT_MAX]. */
+ // Look for skip value. We allow N:M and N:*.
+ /*
+ * Detect an invalid base-10 integer, a valid value that
+ * is followed by something other than a blank or '\0',
+ * or a value that is not in the range [1..UINT_MAX].
+ */
errno = 0;
tmp = strtoul(scan, &end, 10);
+ if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) {
+ FIELDWIDTHS->fields[i].skip = tmp;
+ scan = end + 1;
+ if (*scan == '-' || is_blank(*scan)) {
+ fatal_error = true;
+ break;
+ }
+ // try scanning for field width
+ tmp = strtoul(scan, &end, 10);
+ }
+ else
+ FIELDWIDTHS->fields[i].skip = 0;
+
if (errno != 0
|| (*end != '\0' && ! is_blank(*end))
- || !(0 < tmp && tmp <= INT_MAX)
+ || !(0 < tmp && tmp <= UINT_MAX)
) {
- fatal_error = true;
+ if (*scan == '*') {
+ for (scan++; is_blank(*scan); scan++)
+ continue;
+
+ if (*scan != '\0')
+ fatal(_("`*' must be the last designator in FIELDWIDTHS"));
+
+ FIELDWIDTHS->fields[i].len = UINT_MAX;
+ FIELDWIDTHS->nf = i+1;
+ }
+ else
+ fatal_error = true;
break;
}
- FIELDWIDTHS[i] = tmp;
+ FIELDWIDTHS->fields[i].len = tmp;
+ FIELDWIDTHS->nf = i+1;
scan = end;
/* Skip past any trailing blanks. */
while (is_blank(*scan)) {
@@ -1177,12 +1218,10 @@ set_FIELDWIDTHS()
if (*scan == '\0')
break;
}
- FIELDWIDTHS[i+1] = -1;
- update_PROCINFO_str("FS", "FIELDWIDTHS");
if (fatal_error)
- fatal(_("invalid FIELDWIDTHS value, near `%s'"),
- scan);
+ fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"),
+ i + 1, scan);
}
/* set_FS --- handle things when FS is assigned to */
@@ -1236,7 +1275,7 @@ set_FS()
* FS_regexp will be NULL with a non-null FS_re_yes_case.
* refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
* Please do not remerge.
- */
+ */
refree(FS_re_yes_case);
refree(FS_re_no_case);
FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
@@ -1250,7 +1289,7 @@ choose_fs_function:
if (! do_traditional && fs->stlen == 0) {
static bool warned = false;
- parse_field = null_parse_field;
+ set_parser(null_parse_field);
if (do_lint && ! warned) {
warned = true;
@@ -1259,10 +1298,10 @@ choose_fs_function:
} else if (fs->stlen > 1) {
if (do_lint_old)
warning(_("old awk does not support regexps as value of `FS'"));
- parse_field = re_parse_field;
+ set_parser(re_parse_field);
} else if (RS_is_null) {
/* we know that fs->stlen <= 1 */
- parse_field = sc_parse_field;
+ set_parser(sc_parse_field);
if (fs->stlen == 1) {
if (fs->stptr[0] == ' ') {
default_FS = true;
@@ -1278,10 +1317,7 @@ choose_fs_function:
}
}
} else {
- if (do_posix)
- parse_field = posix_def_parse_field;
- else
- parse_field = def_parse_field;
+ set_parser(def_parse_field);
if (fs->stlen == 1) {
if (fs->stptr[0] == ' ')
@@ -1290,7 +1326,7 @@ choose_fs_function:
/* same special case */
strcpy(buf, "[\\\\]");
else
- parse_field = sc_parse_field;
+ set_parser(sc_parse_field);
}
}
if (remake_re) {
@@ -1302,7 +1338,7 @@ choose_fs_function:
FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
- parse_field = re_parse_field;
+ set_parser(re_parse_field);
} else if (parse_field == re_parse_field) {
FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
@@ -1318,16 +1354,16 @@ choose_fs_function:
*/
if (fs->stlen == 1 && parse_field == re_parse_field)
FS_regexp = FS_re_yes_case;
-
- update_PROCINFO_str("FS", "FS");
}
-/* current_field_sep --- return what field separator is */
+/* current_field_sep --- return the field separator type */
field_sep_type
current_field_sep()
{
- if (parse_field == fw_parse_field)
+ if (api_parser_override)
+ return Using_API;
+ else if (parse_field == fw_parse_field)
return Using_FIELDWIDTHS;
else if (parse_field == fpat_parse_field)
return Using_FPAT;
@@ -1335,6 +1371,21 @@ current_field_sep()
return Using_FS;
}
+/* current_field_sep_str --- return the field separator type as a string */
+
+const char *
+current_field_sep_str()
+{
+ if (api_parser_override)
+ return "API";
+ else if (parse_field == fw_parse_field)
+ return "FIELDWIDTHS";
+ else if (parse_field == fpat_parse_field)
+ return "FPAT";
+ else
+ return "FS";
+}
+
/* update_PROCINFO_str --- update PROCINFO[sub] with string value */
void
@@ -1421,7 +1472,7 @@ set_FPAT()
set_fpat_function:
fpat = force_string(FPAT_node->var_value);
- parse_field = fpat_parse_field;
+ set_parser(fpat_parse_field);
if (remake_re) {
refree(FPAT_re_yes_case);
@@ -1432,8 +1483,6 @@ set_fpat_function:
FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
}
-
- update_PROCINFO_str("FS", "FPAT");
}
/*
@@ -1471,101 +1520,65 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
* via (*parse_field)(). This variation is for when FPAT is a regular
* expression -- use the value to find field contents.
*
- * This was really hard to get right. It happens to bear many resemblances
- * to issues I had with getting gsub right with null matches. When dealing
- * with that I prototyped in awk and had the foresight to save the awk code
- * over in the C file. Starting with that as a base, I finally got to this
- * awk code to do what I needed, and then translated it into C. Fortunately
- * the C code bears a closer correspondance to the awk code here than over
- * by gsub.
+ * The FPAT parsing logic is a bit difficult to specify, in particular
+ * when it comes to allowing null fields at certain locations. To make the code as robust
+ * as possible, an awk reference implementation was written and tested
+ * as a first step, and later recoded in C, preserving its structure as
+ * much as possible.
*
- * BEGIN {
- * false = 0
- * true = 1
- *
- * fpat[1] = "([^,]*)|(\"[^\"]+\")"
- * fpat[2] = fpat[1]
- * fpat[3] = fpat[1]
- * fpat[4] = "aa+"
- * fpat[5] = fpat[4]
- *
- * data[1] = "Robbins,,Arnold,"
- * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * data[4] = "bbbaaacccdddaaaaaqqqq"
- * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
+ * # Reference implementation of the FPAT record parsing.
+ * #
+ * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
+ * # Each loop iteration must consume some characters, except for the first field.
+ * # So a null field is only valid as a first field or after a non-null separator.
+ * # A null record has no fields (not a single null field).
*
- * for (i = 1; i in data; i++) {
- * printf("Splitting: <%s>\n", data[i])
- * n = mypatsplit(data[i], fields, fpat[i], seps)
- * print "n =", n
- * for (j = 1; j <= n; j++)
- * printf("fields[%d] = <%s>\n", j, fields[j])
- * for (j = 0; j in seps; j++)
- * printf("seps[%s] = <%s>\n", j, seps[j])
- * }
- * }
- *
- * function mypatsplit(string, array, pattern, seps,
- * eosflag, non_empty, nf) # locals
+ * function refpatsplit(string, fields, pattern, seps,
+ * parse_start, sep_start, field_start, field_length, field_found, nf) # locals
* {
- * delete array
- * delete seps
- * if (length(string) == 0)
- * return 0
- *
- * eosflag = non_empty = false
- * nf = 0
- * while (match(string, pattern)) {
- * if (RLENGTH > 0) { # easy case
- * non_empty = true
- * if (! (nf in seps)) {
- * if (RSTART == 1) # match at front of string
- * seps[nf] = ""
- * else
- * seps[nf] = substr(string, 1, RSTART - 1)
- * }
- * array[++nf] = substr(string, RSTART, RLENGTH)
- * string = substr(string, RSTART+RLENGTH)
- * if (length(string) == 0)
- * break
- * } else if (non_empty) {
- * # last match was non-empty, and at the
- * # current character we get a zero length match,
- * # which we don't want, so skip over it
- * non_empty = false
- * seps[nf] = substr(string, 1, 1)
- * string = substr(string, 2)
- * } else {
- * # 0 length match
- * if (! (nf in seps)) {
- * if (RSTART == 1)
- * seps[nf] = ""
- * else
- * seps[nf] = substr(string, 1, RSTART - 1)
- * }
- * array[++nf] = ""
- * if (! non_empty && ! eosflag) { # prev was empty
- * seps[nf] = substr(string, 1, 1)
- * }
- * if (RSTART == 1) {
- * string = substr(string, 2)
- * } else {
- * string = substr(string, RSTART + 1)
- * }
- * non_empty = false
- * }
- * if (length(string) == 0) {
- * if (eosflag)
- * break
- * else
- * eosflag = true
- * }
- * }
- * if (length(string) > 0)
- * seps[nf] = string
+ * # Local state variables:
+ * # - parse_start: pointer to the first not yet consumed character
+ * # - sep_start: pointer to the beginning of the parsed separator
+ * # - field_start: pointer to the beginning of the parsed field
+ * # - field_length: length of the parsed field
+ * # - field_found: flag for successful field match
+ * # - nf: Number of fields found so far
+ *
+ * # Prepare for parsing
+ * parse_start = 1 # first not yet parsed char
+ * nf = 0 # fields found so far
+ * delete fields
+ * delete seps
*
- * return length(array)
+ * # Loop that consumes the whole record
+ * while (parse_start <= length(string)) { # still something to parse
+ *
+ * # first attempt to match the next field
+ * sep_start = parse_start
+ * field_found = match(substr(string, parse_start), pattern)
+ *
+ * # check for an invalid null field and retry one character away
+ * if (nf > 0 && field_found && RSTART==1 && RLENGTH==0) {
+ * parse_start++
+ * field_found = match(substr(string, parse_start), pattern)
+ * }
+ *
+ * # store the (sep[n-1],field[n]) pair
+ * if (field_found) {
+ * field_start = parse_start + RSTART - 1
+ * field_length = RLENGTH
+ * seps[nf] = substr(string, sep_start, field_start-sep_start)
+ * fields[++nf] = substr(string, field_start, field_length)
+ * parse_start = field_start + field_length
+ *
+ * # store the final extra sep after the last field
+ * } else {
+ * seps[nf] = substr(string, sep_start)
+ * parse_start = length(string) + 1
+ * }
+ * }
+ *
+ * return nf
* }
*/
static long
@@ -1584,10 +1597,9 @@ fpat_parse_field(long up_to, /* parse only up to this field number */
char *start;
char *end = scan + len;
int regex_flags = RE_NEED_START;
- bool need_to_set_sep;
- bool non_empty;
- bool eosflag;
mbstate_t mbs;
+ char* field_start;
+ bool field_found = false;
memset(&mbs, 0, sizeof(mbstate_t));
@@ -1600,90 +1612,55 @@ fpat_parse_field(long up_to, /* parse only up to this field number */
if (rp == NULL) /* use FPAT */
rp = FPAT_regexp;
- if (in_middle) {
- regex_flags |= RE_NO_BOL;
- }
- non_empty = rp->non_empty;
+ while (scan < end && nf < up_to) { /* still something to parse */
- eosflag = false;
- need_to_set_sep = true;
- start = scan;
- while (research(rp, scan, 0, (end - scan), regex_flags) != -1
- && nf < up_to) {
+ /* first attempt to match the next field */
+ start = scan;
+ field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
- if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
- non_empty = true;
- if (sep_arr != NULL && need_to_set_sep) {
- if (RESTART(rp, scan) == 0) /* match at front */
- set_element(nf, start, 0L, sep_arr);
+ /* check for an invalid null field and retry one character away */
+ if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
+ increment_scan(& scan, end - scan);
+ field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
+ }
+
+ /* store the (sep[n-1],field[n]) pair */
+ if (field_found) {
+ field_start = scan + RESTART(rp, scan);
+ if (sep_arr != NULL) { /* store the separator */
+ if (field_start == start) /* match at front */
+ set_element(nf, start, 0L, sep_arr);
else
- set_element(nf,
+ set_element(nf,
start,
- (long) RESTART(rp, scan),
+ (long) (field_start - start),
sep_arr);
}
/* field is text that matched */
(*set)(++nf,
- scan + RESTART(rp, scan),
+ field_start,
(long)(REEND(rp, scan) - RESTART(rp, scan)),
n);
-
scan += REEND(rp, scan);
- if (scan >= end)
- break;
- need_to_set_sep = true;
- } else if (non_empty) { /* else if non_empty */
- /*
- * last match was non-empty, and at the
- * current character we get a zero length match,
- * which we don't want, so skip over it
- */
- non_empty = false;
- if (sep_arr != NULL) {
- need_to_set_sep = false;
- set_element(nf, start, 1L, sep_arr);
- }
- increment_scan(& scan, end - scan);
+
} else {
- /* 0 length match */
- if (sep_arr != NULL && need_to_set_sep) {
- if (RESTART(rp, scan) == 0) /* RSTART == 1 */
- set_element(nf, start, 0L, sep_arr);
- else
- set_element(nf, start,
- (long) RESTART(rp, scan),
- sep_arr);
- }
- need_to_set_sep = true;
- (*set)(++nf, scan, 0L, n);
- if (! non_empty && ! eosflag) { /* prev was empty */
- if (sep_arr != NULL) {
- set_element(nf, start, 1L, sep_arr);
- need_to_set_sep = false;
- }
- }
- if (RESTART(rp, scan) == 0)
- increment_scan(& scan, end - scan);
- else {
- scan += RESTART(rp, scan);
- }
- non_empty = false;
- }
- if (scan >= end) { /* length(string) == 0 */
- if (eosflag)
- break;
- else
- eosflag = true;
+ /*
+ * No match, store the final extra separator after
+ * the last field.
+ */
+ if (sep_arr != NULL)
+ set_element(nf, start, (long) (end - start), sep_arr);
+ scan = end;
}
-
- start = scan;
- }
- if (scan < end) {
- if (sep_arr != NULL)
- set_element(nf, scan, (long) (end - scan), sep_arr);
}
+ /*
+ * If the last field extends up to the end of the record, generate
+ * a null trailing separator
+ */
+ if (sep_arr != NULL && scan == end && field_found)
+ set_element(nf, scan, 0L, sep_arr);
+
*buf = scan;
- rp->non_empty = non_empty;
return nf;
}