aboutsummaryrefslogtreecommitdiffstats
path: root/field.c
diff options
context:
space:
mode:
authorAndrew J. Schorr <aschorr@telemetry-investments.com>2017-03-21 13:22:18 -0400
committerAndrew J. Schorr <aschorr@telemetry-investments.com>2017-03-21 13:22:18 -0400
commitd1bebd3cbf60fa25883271512cf63e0c3275e3ef (patch)
tree517a5e996e033120e73afbb4c265528909271fbc /field.c
parent489349d84fa92f69b2066240bc202a4f2777c465 (diff)
downloadegawk-d1bebd3cbf60fa25883271512cf63e0c3275e3ef.tar.gz
egawk-d1bebd3cbf60fa25883271512cf63e0c3275e3ef.tar.bz2
egawk-d1bebd3cbf60fa25883271512cf63e0c3275e3ef.zip
Enhance FIELDWIDTHS syntax to support a skip prefix, and unify logic with API field parsing.
Diffstat (limited to 'field.c')
-rw-r--r--field.c196
1 files changed, 92 insertions, 104 deletions
diff --git a/field.c b/field.c
index 5ef4d74b..bd333c92 100644
--- a/field.c
+++ b/field.c
@@ -38,6 +38,8 @@ is_blank(int c)
typedef void (* Setfunc)(long, char *, long, NODE *);
+/* is the API currently overriding the default parsing mechanism? */
+static bool api_parser_override = false;
static long (*parse_field)(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
/*
@@ -57,9 +59,7 @@ static long sc_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
-static long api_parse_field(long, char **, int, NODE *,
- Regexp *, Setfunc, NODE *, NODE *, bool);
-static const awk_input_field_info_t *api_fw = NULL;
+static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
Regexp *, Setfunc, NODE *, NODE *, bool);
static void set_element(long num, char * str, long len, NODE *arr);
@@ -74,7 +74,7 @@ static bool resave_fs;
static NODE *save_FS; /* save current value of FS when line is read,
* to be used in deferred parsing
*/
-static int *FIELDWIDTHS = NULL;
+static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;
NODE **fields_arr; /* array of pointers to the field nodes */
bool field0_valid; /* $(>0) has not been changed yet */
@@ -262,7 +262,7 @@ rebuild_record()
* but better correct than fast.
*/
void
-set_record(const char *buf, int cnt, const awk_input_field_info_t *fw)
+set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
{
NODE *n;
static char *databuf;
@@ -318,11 +318,13 @@ set_record(const char *buf, int cnt, const awk_input_field_info_t *fw)
fields_arr[0] = n;
if (fw != api_fw) {
if ((api_fw = fw) != NULL) {
- if (parse_field != api_parse_field) {
- parse_field = api_parse_field;
+ if (! api_parser_override) {
+ api_parser_override = true;
+ parse_field = fw_parse_field;
update_PROCINFO_str("FS", "API");
}
- } else if (parse_field != normal_parse_field) {
+ } else if (api_parser_override) {
+ api_parser_override = false;
parse_field = normal_parse_field;
update_PROCINFO_str("FS", current_field_sep_str());
}
@@ -712,83 +714,38 @@ sc_parse_field(long up_to, /* parse only up to this field number */
}
/*
- * fw_parse_field --- field parsing using FIELDWIDTHS spec
- *
- * This is called from get_field() via (*parse_field)().
- * This variation is for fields are fixed widths.
+ * calc_mbslen --- calculate the length in bytes of a multi-byte string
+ * containing len characters.
*/
-static long
-fw_parse_field(long up_to, /* parse only up to this field number */
- char **buf, /* on input: string to parse; on output: point to start next */
- int len,
- NODE *fs ATTRIBUTE_UNUSED,
- Regexp *rp ATTRIBUTE_UNUSED,
- Setfunc set, /* routine to set the value of the parsed field */
- NODE *n,
- NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
- bool in_middle ATTRIBUTE_UNUSED)
+
+static size_t
+calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
{
- char *scan = *buf;
- long nf = parse_high_water;
- char *end = scan + len;
- int nmbc;
- size_t mbclen;
- size_t mbslen;
- size_t lenrest;
- char *mbscan;
- mbstate_t mbs;
- memset(&mbs, 0, sizeof(mbstate_t));
+ size_t mbclen;
+ char *mbscan = scan;
- if (up_to == UNLIMITED)
- nf = 0;
- if (len == 0)
- return nf;
- for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
- if (gawk_mb_cur_max > 1) {
- nmbc = 0;
- mbslen = 0;
- mbscan = scan;
- lenrest = end - scan;
- while (nmbc < len && mbslen < lenrest) {
- mbclen = mbrlen(mbscan, end - mbscan, &mbs);
- if ( mbclen == 1
- || mbclen == (size_t) -1
- || mbclen == (size_t) -2
- || mbclen == 0) {
- /* We treat it as a singlebyte character. */
- mbclen = 1;
- }
- if (mbclen <= end - mbscan) {
- mbscan += mbclen;
- mbslen += mbclen;
- ++nmbc;
- }
- }
- (*set)(++nf, scan, (long) mbslen, n);
- scan += mbslen;
- } else {
- if (len > end - scan)
- len = end - scan;
- (*set)(++nf, scan, (long) len, n);
- scan += len;
- }
+ while (len-- > 0 && mbscan < end) {
+ mbclen = mbrlen(mbscan, end - mbscan, mbs);
+ if (!(mbclen > 0 && mbclen <= (size_t)(end - mbscan)))
+ /*
+ * We treat it as a singlebyte character. This should
+ * catch error codes 0, (size_t) -1, and (size_t) -2.
+ */
+ mbclen = 1;
+ mbscan += mbclen;
}
- if (len == -1)
- *buf = end;
- else
- *buf = scan;
- return nf;
+ return mbscan - scan;
}
/*
- * api_parse_field --- field parsing using field widths returned by API parser.
+ * fw_parse_field --- field parsing using FIELDWIDTHS spec
*
* This is called from get_field() via (*parse_field)().
+ * This variation is for fields are fixed widths.
*/
-
static long
-api_parse_field(long up_to, /* parse only up to this field number */
+fw_parse_field(long up_to, /* parse only up to this field number */
char **buf, /* on input: string to parse; on output: point to start next */
int len,
NODE *fs ATTRIBUTE_UNUSED,
@@ -801,26 +758,50 @@ api_parse_field(long up_to, /* parse only up to this field number */
char *scan = *buf;
long nf = parse_high_water;
char *end = scan + len;
- int skiplen;
+ const awk_fieldwidth_info_t *fw;
+ mbstate_t mbs;
+ size_t skiplen;
size_t flen;
+ fw = (api_parser_override ? api_fw : FIELDWIDTHS);
+
if (up_to == UNLIMITED)
nf = 0;
if (len == 0)
return nf;
- while (nf < up_to) {
- if ((skiplen = api_fw[nf].skip) < 0) {
- *buf = end;
- return nf;
+ if (gawk_mb_cur_max > 1 && fw->use_chars) {
+ /*
+ * XXX This may be a bug. Most likely, shift state should
+ * persist across all fields in a record, if not across record
+ * boundaries as well.
+ */
+ memset(&mbs, 0, sizeof(mbstate_t));
+ while (nf < up_to) {
+ if (nf >= fw->nf) {
+ *buf = end;
+ return nf;
+ }
+ scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
+ flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
+ (*set)(++nf, scan, (long) flen, n);
+ scan += flen;
+ }
+ } else {
+ while (nf < up_to) {
+ if (nf >= fw->nf) {
+ *buf = end;
+ return nf;
+ }
+ skiplen = fw->fields[nf].skip;
+ if (skiplen > end - scan)
+ skiplen = end - scan;
+ scan += skiplen;
+ flen = fw->fields[nf].len;
+ if (flen > end - scan)
+ flen = end - scan;
+ (*set)(++nf, scan, (long) flen, n);
+ scan += flen;
}
- if (skiplen > end - scan)
- skiplen = end - scan;
- scan += skiplen;
- flen = api_fw[nf].len;
- if (flen > end - scan)
- flen = end - scan;
- (*set)(++nf, scan, (long) flen, n);
- scan += flen;
}
*buf = scan;
return nf;
@@ -1129,7 +1110,7 @@ static void
set_parser(long (*func)(long, char **, int, NODE *, Regexp *, Setfunc, NODE *, NODE *, bool))
{
normal_parse_field = func;
- if (parse_field != api_parse_field && parse_field != func) {
+ if (! api_parser_override && parse_field != func) {
parse_field = func;
update_PROCINFO_str("FS", current_field_sep_str());
}
@@ -1156,7 +1137,7 @@ set_FIELDWIDTHS()
return;
/*
- * If changing the way fields are split, obey least-suprise
+ * If changing the way fields are split, obey least-surprise
* semantics, and force $0 to be split totally.
*/
if (fields_arr != NULL)
@@ -1166,17 +1147,17 @@ set_FIELDWIDTHS()
tmp = force_string(FIELDWIDTHS_node->var_value);
scan = tmp->stptr;
- if (FIELDWIDTHS == NULL)
- emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
- FIELDWIDTHS[0] = 0;
- for (i = 1; ; i++) {
+ if (FIELDWIDTHS == NULL) {
+ emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
+ FIELDWIDTHS->use_chars = awk_true;
+ }
+ FIELDWIDTHS->nf = 0;
+ for (i = 0; ; i++) {
unsigned long int tmp;
- if (i + 1 >= fw_alloc) {
+ if (i >= fw_alloc) {
fw_alloc *= 2;
- erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
+ erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
}
- /* Initialize value to be end of list */
- FIELDWIDTHS[i] = -1;
/* Ensure that there is no leading `-' sign. Otherwise,
strtoul would accept it and return a bogus result. */
while (is_blank(*scan)) {
@@ -1194,6 +1175,13 @@ set_FIELDWIDTHS()
or a value that is not in the range [1..INT_MAX]. */
errno = 0;
tmp = strtoul(scan, &end, 10);
+ if (errno == 0 && *end == ':' && (0 < tmp && tmp <= INT_MAX)) {
+ FIELDWIDTHS->fields[i].skip = tmp;
+ scan = end + 1;
+ tmp = strtoul(scan, &end, 10);
+ }
+ else
+ FIELDWIDTHS->fields[i].skip = 0;
if (errno != 0
|| (*end != '\0' && ! is_blank(*end))
|| !(0 < tmp && tmp <= INT_MAX)
@@ -1201,7 +1189,8 @@ set_FIELDWIDTHS()
fatal_error = true;
break;
}
- FIELDWIDTHS[i] = tmp;
+ FIELDWIDTHS->fields[i].len = tmp;
+ FIELDWIDTHS->nf = i+1;
scan = end;
/* Skip past any trailing blanks. */
while (is_blank(*scan)) {
@@ -1210,7 +1199,6 @@ set_FIELDWIDTHS()
if (*scan == '\0')
break;
}
- FIELDWIDTHS[i+1] = -1;
if (fatal_error)
fatal(_("invalid FIELDWIDTHS value, near `%s'"),
@@ -1354,12 +1342,12 @@ choose_fs_function:
field_sep_type
current_field_sep()
{
- if (parse_field == fw_parse_field)
+ if (api_parser_override)
+ return Using_API;
+ else if (parse_field == fw_parse_field)
return Using_FIELDWIDTHS;
else if (parse_field == fpat_parse_field)
return Using_FPAT;
- else if (parse_field == api_parse_field)
- return Using_API;
else
return Using_FS;
}
@@ -1369,12 +1357,12 @@ current_field_sep()
const char *
current_field_sep_str()
{
- if (parse_field == fw_parse_field)
+ if (api_parser_override)
+ return "API";
+ else if (parse_field == fw_parse_field)
return "FIELDWIDTHS";
else if (parse_field == fpat_parse_field)
return "FPAT";
- else if (parse_field == api_parse_field)
- return "API";
else
return "FS";
}