diff options
Diffstat (limited to 'field.c')
-rw-r--r-- | field.c | 222 |
1 files changed, 87 insertions, 135 deletions
@@ -2,22 +2,22 @@ * field.c - routines for dealing with fields and record parsing */ -/* +/* * Copyright (C) 1986, 1988, 1989, 1991-2016 the Free Software Foundation, Inc. - * + * * This file is part of GAWK, the GNU implementation of the * AWK Programming Language. - * + * * GAWK is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. - * + * * GAWK is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA @@ -44,8 +44,6 @@ static long re_parse_field(long, char **, int, NODE *, Regexp *, Setfunc, NODE *, NODE *, bool); static long def_parse_field(long, char **, int, NODE *, Regexp *, Setfunc, NODE *, NODE *, bool); -static long posix_def_parse_field(long, char **, int, NODE *, - Regexp *, Setfunc, NODE *, NODE *, bool); static long null_parse_field(long, char **, int, NODE *, Regexp *, Setfunc, NODE *, NODE *, bool); static long sc_parse_field(long, char **, int, NODE *, @@ -57,6 +55,7 @@ static long fpat_parse_field(long, char **, int, NODE *, static void set_element(long num, char * str, long len, NODE *arr); static void grow_fields_arr(long num); static void set_field(long num, char *str, long len, NODE *dummy); +static void purge_record(void); static char *parse_extent; /* marks where to restart parse of record */ static long parse_high_water = 0; /* field number that we have parsed so far */ @@ -95,7 +94,7 @@ init_fields() getnode(Null_field); *Null_field = *Nnull_string; Null_field->valref = 1; - 
Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD); + Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */ field0_valid = true; } @@ -133,7 +132,7 @@ set_field(long num, n = fields_arr[num]; n->stptr = str; n->stlen = len; - n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD); + n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */ } /* rebuild_record --- Someone assigned a value to $(something). @@ -163,7 +162,7 @@ rebuild_record() tlen += (NF - 1) * OFSlen; if ((long) tlen < 0) tlen = 0; - emalloc(ops, char *, tlen + 2, "rebuild_record"); + emalloc(ops, char *, tlen + 1, "rebuild_record"); cops = ops; ops[0] = '\0'; for (i = 1; i <= NF; i++) { @@ -196,29 +195,32 @@ rebuild_record() */ for (cops = ops, i = 1; i <= NF; i++) { NODE *r = fields_arr[i]; - if (r->stlen > 0) { + /* + * There is no reason to copy malloc'ed fields to point into + * the new $0 buffer, although that's how previous versions did + * it. It seems faster to leave the malloc'ed fields in place. + */ + if (r->stlen > 0 && (r->flags & MALLOC) == 0) { NODE *n; getnode(n); - if ((r->flags & FIELD) == 0) { - *n = *Null_field; - n->stlen = r->stlen; - if ((r->flags & (NUMCUR|NUMBER)) != 0) { - n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER)); -#ifdef HAVE_MPFR - if (is_mpg_float(r)) { - mpfr_init(n->mpg_numbr); - mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE); - } else if (is_mpg_integer(r)) { - mpz_init(n->mpg_i); - mpz_set(n->mpg_i, r->mpg_i); - } else -#endif - n->numbr = r->numbr; - } - } else { - *n = *r; - n->flags &= ~(MALLOC|STRING); + *n = *r; + if (r->valref > 1) { + /* + * This probably never happens, since it + * was not considered by previous versions of + * this function. But it seems clear that + * we can't leave r's stptr pointing into the + * old $0 buffer that we are about to unref. + * It's not a priori obvious that valref must be + * 1 in all cases, so it seems wise to support + * this corner case. The only question is + * whether to add a warning message. 
+ */ + emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record"); + memcpy(r->stptr, cops, r->stlen); + r->stptr[r->stlen] = '\0'; + r->flags |= MALLOC; } n->stptr = cops; @@ -229,6 +231,10 @@ rebuild_record() cops += fields_arr[i]->stlen + OFSlen; } + assert((fields_arr[0]->flags & MALLOC) == 0 + ? fields_arr[0]->valref == 1 + : true); + unref(fields_arr[0]); fields_arr[0] = tmp; @@ -254,7 +260,7 @@ set_record(const char *buf, int cnt) #define INITIAL_SIZE 512 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */ - reset_record(); + purge_record(); /* buffer management: */ if (databuf_size == 0) { /* first time */ @@ -269,8 +275,11 @@ set_record(const char *buf, int cnt) * databuf_size is > cnt after allocation. */ if (cnt >= databuf_size) { - while (cnt >= databuf_size && databuf_size <= MAX_SIZE) + do { + if (databuf_size > MAX_SIZE/2) + fatal(_("input record too large")); databuf_size *= 2; + } while (cnt >= databuf_size); erealloc(databuf, char *, databuf_size, "set_record"); memset(databuf, '\0', databuf_size); } @@ -278,20 +287,24 @@ set_record(const char *buf, int cnt) memcpy(databuf, buf, cnt); /* - * Add terminating '\0' so that C library routines + * Add terminating '\0' so that C library routines * will know when to stop. */ databuf[cnt] = '\0'; /* manage field 0: */ + assert((fields_arr[0]->flags & MALLOC) == 0 + ? 
fields_arr[0]->valref == 1 + : true); + unref(fields_arr[0]); getnode(n); n->stptr = databuf; n->stlen = cnt; n->valref = 1; n->type = Node_val; - n->stfmt = -1; - n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD); + n->stfmt = STFMT_UNUSED; + n->flags = (STRING|STRCUR|USER_INPUT); /* do not set MALLOC */ fields_arr[0] = n; #undef INITIAL_SIZE @@ -303,13 +316,21 @@ set_record(const char *buf, int cnt) void reset_record() { + fields_arr[0] = force_string(fields_arr[0]); + purge_record(); +} + +static void +purge_record() +{ int i; NODE *n; - fields_arr[0] = force_string(fields_arr[0]); - NF = -1; for (i = 1; i <= parse_high_water; i++) { + assert((fields_arr[i]->flags & MALLOC) == 0 + ? fields_arr[i]->valref == 1 + : true); unref(fields_arr[i]); getnode(n); *n = *Null_field; @@ -341,7 +362,7 @@ set_NF() assert(NF != -1); (void) force_number(NF_node->var_value); - nf = get_number_si(NF_node->var_value); + nf = get_number_si(NF_node->var_value); if (nf < 0) fatal(_("NF set to negative value")); NF = nf; @@ -409,7 +430,7 @@ re_parse_field(long up_to, /* parse only up to this field number */ sep = scan; while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n')) scan++; - if (sep_arr != NULL && sep < scan) + if (sep_arr != NULL && sep < scan) set_element(nf, sep, (long)(scan - sep), sep_arr); } @@ -441,8 +462,8 @@ re_parse_field(long up_to, /* parse only up to this field number */ } (*set)(++nf, field, (long)(scan + RESTART(rp, scan) - field), n); - if (sep_arr != NULL) - set_element(nf, scan + RESTART(rp, scan), + if (sep_arr != NULL) + set_element(nf, scan + RESTART(rp, scan), (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr); scan += REEND(rp, scan); field = scan; @@ -506,7 +527,7 @@ def_parse_field(long up_to, /* parse only up to this field number */ sep = scan; for (; nf < up_to; scan++) { /* - * special case: fs is single space, strip leading whitespace + * special case: fs is single space, strip leading whitespace */ while (scan < end && (*scan == ' ' || 
*scan == '\t' || *scan == '\n')) scan++; @@ -538,75 +559,6 @@ def_parse_field(long up_to, /* parse only up to this field number */ } /* - * posix_def_parse_field --- default field parsing. - * - * This is called both from get_field() and from do_split() - * via (*parse_field)(). This variation is for when FS is a single space - * character. The only difference between this and def_parse_field() - * is that this one does not allow newlines to separate fields. - */ - -static long -posix_def_parse_field(long up_to, /* parse only up to this field number */ - char **buf, /* on input: string to parse; on output: point to start next */ - int len, - NODE *fs, - Regexp *rp ATTRIBUTE_UNUSED, - Setfunc set, /* routine to set the value of the parsed field */ - NODE *n, - NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */ - bool in_middle ATTRIBUTE_UNUSED) -{ - char *scan = *buf; - long nf = parse_high_water; - char *field; - char *end = scan + len; - char sav; - - if (up_to == UNLIMITED) - nf = 0; - if (len == 0) - return nf; - - /* - * Nasty special case. If FS set to "", return whole record - * as first field. This is not worth a separate function. 
- */ - if (fs->stlen == 0) { - (*set)(++nf, *buf, len, n); - *buf += len; - return nf; - } - - /* before doing anything save the char at *end */ - sav = *end; - /* because it will be destroyed now: */ - - *end = ' '; /* sentinel character */ - for (; nf < up_to; scan++) { - /* - * special case: fs is single space, strip leading whitespace - */ - while (scan < end && (*scan == ' ' || *scan == '\t')) - scan++; - if (scan >= end) - break; - field = scan; - while (*scan != ' ' && *scan != '\t') - scan++; - (*set)(++nf, field, (long)(scan - field), n); - if (scan == end) - break; - } - - /* everything done, restore original char at *end */ - *end = sav; - - *buf = scan; - return nf; -} - -/* * null_parse_field --- each character is a separate field * * This is called both from get_field() and from do_split() @@ -857,11 +809,11 @@ get_field(long requested, Func_ptr *assign) /* * Keep things uniform. Also, mere intention of assigning something * to $n should not make $0 invalid. Makes sense to invalidate $0 - * after the actual assignment is performed. Not a real issue in + * after the actual assignment is performed. Not a real issue in * the interpreter otherwise, but causes problem in the * debugger when watching or printing fields. 
*/ - + if (assign != NULL) *assign = invalidate_field0; /* $0 needs reconstruction */ #endif @@ -930,7 +882,7 @@ set_element(long num, char *s, long len, NODE *n) NODE *sub; it = make_string(s, len); - it->flags |= MAYBE_NUM; + it->flags |= USER_INPUT; sub = make_number((AWKNUM) (num)); lhs = assoc_lookup(n, sub); unref(*lhs); @@ -977,12 +929,12 @@ do_split(int nargs) if (sep_arr != NULL) { if (sep_arr == arr) - fatal(_("split: cannot use the same array for second and fourth args")); + fatal(_("split: cannot use the same array for second and fourth args")); /* This checks need to be done before clearing any of the arrays */ for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array) if (tmp == arr) - fatal(_("split: cannot use a subarray of second arg for fourth arg")); + fatal(_("split: cannot use a subarray of second arg for fourth arg")); for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array) if (tmp == sep_arr) fatal(_("split: cannot use a subarray of fourth arg for second arg")); @@ -1000,6 +952,9 @@ do_split(int nargs) return make_number((AWKNUM) 0); } + if ((sep->flags & REGEX) != 0) + sep = sep->typed_re; + if ( (sep->re_flags & FS_DFLT) != 0 && current_field_sep() == Using_FS && ! 
RS_is_null) { @@ -1020,10 +975,7 @@ do_split(int nargs) } } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) { if (fs->stptr[0] == ' ') { - if (do_posix) - parseit = posix_def_parse_field; - else - parseit = def_parse_field; + parseit = def_parse_field; } else parseit = sc_parse_field; } else { @@ -1065,13 +1017,16 @@ do_patsplit(int nargs) src = TOP_STRING(); + if ((sep->flags & REGEX) != 0) + sep = sep->typed_re; + fpat = sep->re_exp; if (fpat->stlen == 0) fatal(_("patsplit: third argument must be non-null")); if (sep_arr != NULL) { if (sep_arr == arr) - fatal(_("patsplit: cannot use the same array for second and fourth args")); + fatal(_("patsplit: cannot use the same array for second and fourth args")); /* These checks need to be done before clearing any of the arrays */ for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array) @@ -1138,7 +1093,7 @@ set_FIELDWIDTHS() FIELDWIDTHS[0] = 0; for (i = 1; ; i++) { unsigned long int tmp; - if (i + 2 >= fw_alloc) { + if (i + 1 >= fw_alloc) { fw_alloc *= 2; erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS"); } @@ -1165,7 +1120,7 @@ set_FIELDWIDTHS() || (*end != '\0' && ! is_blank(*end)) || !(0 < tmp && tmp <= INT_MAX) ) { - fatal_error = true; + fatal_error = true; break; } FIELDWIDTHS[i] = tmp; @@ -1236,7 +1191,7 @@ set_FS() * FS_regexp will be NULL with a non-null FS_re_yes_case. * refree() handles null argument; no need for `if (FS_regexp != NULL)' below. * Please do not remerge. 
- */ + */ refree(FS_re_yes_case); refree(FS_re_no_case); FS_re_yes_case = FS_re_no_case = FS_regexp = NULL; @@ -1278,10 +1233,7 @@ choose_fs_function: } } } else { - if (do_posix) - parse_field = posix_def_parse_field; - else - parse_field = def_parse_field; + parse_field = def_parse_field; if (fs->stlen == 1) { if (fs->stptr[0] == ' ') @@ -1482,19 +1434,19 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs) * BEGIN { * false = 0 * true = 1 - * + * * fpat[1] = "([^,]*)|(\"[^\"]+\")" * fpat[2] = fpat[1] * fpat[3] = fpat[1] * fpat[4] = "aa+" * fpat[5] = fpat[4] - * + * * data[1] = "Robbins,,Arnold," * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA" * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA" * data[4] = "bbbaaacccdddaaaaaqqqq" * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa - * + * * for (i = 1; i in data; i++) { * printf("Splitting: <%s>\n", data[i]) * n = mypatsplit(data[i], fields, fpat[i], seps) @@ -1505,7 +1457,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs) * printf("seps[%s] = <%s>\n", j, seps[j]) * } * } - * + * * function mypatsplit(string, array, pattern, seps, * eosflag, non_empty, nf) # locals * { @@ -1513,7 +1465,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs) * delete seps * if (length(string) == 0) * return 0 - * + * * eosflag = non_empty = false * nf = 0 * while (match(string, pattern)) { @@ -1564,7 +1516,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs) * } * if (length(string) > 0) * seps[nf] = string - * + * * return length(array) * } */ @@ -1637,7 +1589,7 @@ fpat_parse_field(long up_to, /* parse only up to this field number */ * last match was non-empty, and at the * current character we get a zero length match, * which we don't want, so skip over it - */ + */ non_empty = false; if (sep_arr != NULL) { need_to_set_sep = false; |