diff options
-rw-r--r-- | ChangeLog | 168 | ||||
-rw-r--r-- | array.c | 2 | ||||
-rw-r--r-- | awk.h | 65 | ||||
-rw-r--r-- | builtin.c | 7 | ||||
-rw-r--r-- | eval.c | 6 | ||||
-rw-r--r-- | field.c | 78 | ||||
-rw-r--r-- | gawkapi.c | 55 | ||||
-rw-r--r-- | gawkapi.h | 6 | ||||
-rw-r--r-- | int_array.c | 6 | ||||
-rw-r--r-- | interpret.h | 23 | ||||
-rw-r--r-- | io.c | 1 | ||||
-rw-r--r-- | mpfr.c | 3 | ||||
-rw-r--r-- | node.c | 10 | ||||
-rw-r--r-- | str_array.c | 8 | ||||
-rw-r--r-- | test/ChangeLog | 20 | ||||
-rw-r--r-- | test/Makefile.am | 14 | ||||
-rw-r--r-- | test/Makefile.in | 29 | ||||
-rw-r--r-- | test/Maketests | 15 | ||||
-rw-r--r-- | test/apiterm.awk | 8 | ||||
-rw-r--r-- | test/apiterm.in | 1 | ||||
-rw-r--r-- | test/apiterm.ok | 3 | ||||
-rw-r--r-- | test/arrayind3.awk | 19 | ||||
-rw-r--r-- | test/arrayind3.ok | 2 | ||||
-rw-r--r-- | test/fldterm.awk | 10 | ||||
-rw-r--r-- | test/fldterm.in | 1 | ||||
-rw-r--r-- | test/fldterm.ok | 2 | ||||
-rw-r--r-- | test/forcenum.awk | 6 | ||||
-rw-r--r-- | test/forcenum.ok | 6 | ||||
-rw-r--r-- | test/rebuild.in | 2 | ||||
-rw-r--r-- | test/rebuild.ok | 2 |
30 files changed, 461 insertions, 117 deletions
@@ -291,13 +291,179 @@ * eval.c (set_LINT): Reset lintfunc to `warning' for LINT="invalid". Thanks to Andy Schorr for the report. +2016-07-08 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * awk.h: Restore previous comment about unterminated strings, since + I am removing the string termination patches from field.c + (free_api_string_copies): Declare new gawkapi function. + * builtin.c (do_mktime, do_system): Restore temporary string + termination to protect against unterminated field values. + (nondec2awknum): Remove comment about unnecessary termination. + * eval.c (posix_compare): Restore temporary string termination. + * field.c (databuf): Remove struct no longer needed. + (set_field): Remove memcpy for string termination, since we will support + unterminated field string values. + (rebuild_record): Ditto. Also no need to allocate space for terminated + string copies. + (allocate_databuf): Remove function, since memory management can again + be done inside set_record. + (set_record): Restore inline buffer management logic. + (reset_record): Remove calls to allocate_databuf, since we no longer + need space for making terminated copies of field strings. + * gawkapi.c (free_api_string_copies): New function to free strings + that we made to provide terminated copies to API functions. + (assign_string): New function to convert a string to an awk_value, + making sure to copy it if we need to terminate it. + (node_to_awk_value): Use assign_string to return string values with + NUL termination protection. + * int_array.c (is_integer): Restore temporary string termination. + * interpret.h (Op_push_i): Ditto. + (Op_ext_builtin): After external function returns, call + free_api_string_copies to free temporary string copies. + * mpfr.c (force_mpnum): Restore temporary string termination. + * node.c (r_force_number, get_ieee_magic_val): Ditto. + 2016-07-08 Arnold D. Robbins <arnold@skeeve.com> * dfa.c: Sync with GNU grep. 
Unrelated: - * builtin.c (do_print): Coding style change. + * builtin.c (do_print): Coding style change. (This change obsoleted + by earlier changes in the fixtype branch.) + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * awk.h: Modify comments to indicate that MAYBE_NUM will now be + left enabled to indicate strnum values by the NUMBER|MAYBE_NUM + combination, whereas STRING|MAYBE_NUM indicates a potential strnum. + (fixtype): Modify MAYBE_NUM test to avoid calling force_number if + NUMCUR is already set. + * builtin.c (do_typeof): Call fixtype to resolve argument type. + This forces parsing of numeric strings, so there's a performance + penalty, but we must do this to give a correct result. The meaning + of "strnum" changes from "potential strnum" to "actual strnum". + * eval.c (set_TEXTDOMAIN): Remove some dead code left over from last + patch. + * int_array.c (is_integer): When a MAYBE_NUM is converted successfully + to a NUMBER, leave the MAYBE_NUM flag enabled. + * mpfr.c (mpg_force_number): Ditto. + * node.c (r_force_number): Ditto. + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * awk.h: Modify stptr comment to indicate that all strings are now + NUL-terminated. + * builtin.c (do_mktime): Remove unnecessary logic to terminate + the string with '\0' temporarily. + (do_system) Ditto. + (nondec2awknum): Add a comment about termination. + * eval.c (posix_compare): Remove logic to terminate strings temporarily. + (set_ORS): No need to terminate ORS, since the string node is already + terminated. What gave us the right to modify that node anyway? + (fmt_index): Remove code to terminate string. This seems to have been + invalid anyway, since we don't own that memory. + (set_TEXTDOMAIN): Do not terminate TEXTDOMAIN string, since the node + is already terminated. We didn't have the right to modify that node + anyway. + * gawkapi.c (node_to_awk_value): Add assert checks to confirm that the + string is NUL-terminated. 
+ * gawkapi.h: Modify awk_string comment to indicate that strings are + always terminated with '\0'. + * int_array.c (is_integer): Remove unnecessary logic to terminate string + with '\0' temporarily. + * interpret.h (Op_push_i): Ditto. + * io.c (nextfile): Remove string termination. We didn't own that memory + anyway. + * mpfr.c (force_mpnum): Remove unnecessary logic to terminate the + string with '\0' temporarily. + * node.c (r_force_number): Remove NUL termination around strtod call, + since we already know that there is either a white space or '\0' + character there. Either one will stop strtod. + (get_ieee_magic_val): Ditto. + * profile.c (pp_number): No need to terminate string returned by + r_format_val. + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * interpret.h (Op_field_spec): Now that all $n field values are + NUL-terminated, there is no reason to call dupnode for $n where n > 0. + This saves malloc and copying overhead, thereby more than offsetting the + performance hit of the additional copying and NUL-termination in the + last patch to field.c. It also eliminates repeated parsing in cases + where $n, for n > 1, was accessed more than once in a numeric context, + so the new approach should be a performance win. + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + Make sure that all field values, and therefore all strings inside gawk, + are terminated with a '\0' character! + * field.c (databuf): New static struct to hold info about our buffer to + contain the field string values. + (allocate_databuf): New function to make sure the databuf is large + enough to hold $0 and copies of $1 through $NF. + (set_field): Copy $n into free space previously allocated in databuf + and add a '\0' at the end. + (rebuild_record): Call allocate_databuf to ensure sufficient space + for copying non-malloced field values. When copying field values, + use databuf to create a NUL-terminated copy. 
+ (purge_record): New function extracted from reset_record to initialize + $1 through $NF to null values. + (set_record): Buffer management moved to new allocate_databuf function. + Call purge_record instead of reset_record, since reset_record contains + some extra logic not needed in this case. + (reset_record): Call purge_record to do most of the work, and call + allocate_databuf to make sure we have a big enough buffer to contain + copies of the $1 through $NF. + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * awk.h: Renumber flags to remove gap created when FIELD was removed. + +2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * field.c (rebuild_record): Need to set MALLOC flag if we allocate + memory for a residual field node with valref > 1. + +2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * field.c (rebuild_record): Do not bother to create new field nodes + to replace malloc'ed nodes when rebuilding $0. + +2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * awk.h (FIELD): Remove unnecessary flag. + (MALLOC): Move definition to join the others, and improve the comment. + * array.c (value_info): Replace FIELD test with MALLOC test. + * eval.c (flags2str): Remove FIELD flag. + * field.c (init_fields): Remove FIELD bit from Null_field->flags. + (set_field): Remove FIELD bit from flags. + (rebuild_record): Test against MALLOC instead of FIELD. If a field + node has valref > 1, we should make a copy, although I don't think + it is valid for this to happen. + (set_record): Remove FIELD bit from flags. + * interpret.h (UNFIELD): Add comment, and test MALLOC flag instead of + FIELD. Remove probably buggy code to disable the FIELD flag when + valref is 1; that would have created a node where neither the FIELD + nor MALLOC flag was set, which seems invalid. + * node.c (r_dupnode): Remove code disabling FIELD flag. + +2016-07-04 Andrew J. 
Schorr <aschorr@telemetry-investments.com> + + * awk.h (force_string_fmt): New inline function to get the string + representation in a requested format. + (force_string): Reimplement as a macro using force_string_fmt function. + (force_string_ofmt): New macro to get a value's OFMT representation. + * builtin.c (do_print): Use new force_string_ofmt macro instead of + duplicating the logic inline. + +2016-07-04 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * str_array.c (str_lookup): There is no need to worry about the + MAYBE_NUM flag, since the code has been patched to make sure to + preserve the string value of strnum values, and the integer array + code should no longer mistakenly claim a strnum integer with a + nonstandard string representation. 2016-07-03 Andrew J. Schorr <aschorr@telemetry-investments.com> @@ -696,7 +696,7 @@ value_info(NODE *n) fprintf(output_fp, ":%s", flags2str(n->flags)); - if ((n->flags & FIELD) == 0) + if ((n->flags & MALLOC) != 0) fprintf(output_fp, ":%ld", n->valref); else fprintf(output_fp, ":"); @@ -389,9 +389,6 @@ typedef struct exp_node { NODETYPE type; unsigned int flags; -/* any type */ -# define MALLOC 0x0001 /* can be free'd */ - /* type = Node_val */ /* * STRING and NUMBER are mutually exclusive, except for the special @@ -411,14 +408,16 @@ typedef struct exp_node { * b = a + 0 # Adds NUMCUR to a, since numeric value * # is now available. But the type hasn't changed! * - * MAYBE_NUM is the joker. It means "this is string data, but - * the user may have really wanted it to be a number. If we have - * to guess, like in a comparison, turn it into a number if the string - * is indeed numeric." + * MAYBE_NUM is the joker. When STRING|MAYBE_NUM is set, it means + * "this is string data, but the user may have really wanted it to be a + * number. If we have to guess, like in a comparison, turn it into a + * number if the string is indeed numeric." * For example, gawk -v a=42 .... 
* Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where * a number is needed, it gets turned into a NUMBER and STRING - * is cleared. + * is cleared. In that case, we leave the MAYBE_NUM in place, so + * the combination NUMBER|MAYBE_NUM means it is a strnum a.k.a. a + * "numeric string". * * WSTRCUR is for efficiency. If in a multibyte locale, and we * need to do something character based (substr, length, etc.) @@ -436,30 +435,30 @@ typedef struct exp_node { * * We hope that the rest of the flags are self-explanatory. :-) */ +# define MALLOC 0x0001 /* stptr can be free'd, i.e. not a field node pointing into a shared buffer */ # define STRING 0x0002 /* assigned as string */ # define STRCUR 0x0004 /* string value is current */ # define NUMCUR 0x0008 /* numeric value is current */ # define NUMBER 0x0010 /* assigned as number */ # define MAYBE_NUM 0x0020 /* user input: if NUMERIC then * a NUMBER */ -# define FIELD 0x0040 /* this is a field */ -# define INTLSTR 0x0080 /* use localized version */ -# define NUMINT 0x0100 /* numeric value is an integer */ -# define INTIND 0x0200 /* integral value is array index; +# define INTLSTR 0x0040 /* use localized version */ +# define NUMINT 0x0080 /* numeric value is an integer */ +# define INTIND 0x0100 /* integral value is array index; * lazy conversion to string. 
*/ -# define WSTRCUR 0x0400 /* wide str value is current */ -# define MPFN 0x0800 /* arbitrary-precision floating-point number */ -# define MPZN 0x1000 /* arbitrary-precision integer */ -# define NO_EXT_SET 0x2000 /* extension cannot set a value for this variable */ -# define NULL_FIELD 0x4000 /* this is the null field */ +# define WSTRCUR 0x0200 /* wide str value is current */ +# define MPFN 0x0400 /* arbitrary-precision floating-point number */ +# define MPZN 0x0800 /* arbitrary-precision integer */ +# define NO_EXT_SET 0x1000 /* extension cannot set a value for this variable */ +# define NULL_FIELD 0x2000 /* this is the null field */ /* type = Node_var_array */ -# define ARRAYMAXED 0x8000 /* array is at max size */ -# define HALFHAT 0x10000 /* half-capacity Hashed Array Tree; +# define ARRAYMAXED 0x4000 /* array is at max size */ +# define HALFHAT 0x8000 /* half-capacity Hashed Array Tree; * See cint_array.c */ -# define XARRAY 0x20000 -# define NUMCONSTSTR 0x40000 /* have string value for numeric constant */ +# define XARRAY 0x10000 +# define NUMCONSTSTR 0x20000 /* have string value for numeric constant */ } NODE; #define vname sub.nodep.name @@ -493,6 +492,7 @@ typedef struct exp_node { * to '\0'. This is helpful when calling functions such as strtod that require * a NUL-terminated argument. In particular, field values $n for n > 0 and * n < NF will not have a NUL terminator, since they point into the $0 buffer. + * All other strings are NUL-terminated. 
*/ #define stptr sub.val.sp #define stlen sub.val.slen @@ -1517,6 +1517,7 @@ extern void update_ext_api(void); extern NODE *awk_value_to_node(const awk_value_t *); extern void run_ext_exit_handlers(int exitval); extern void print_ext_versions(void); +extern void free_api_string_copies(void); /* gawkmisc.c */ extern char *gawk_name(const char *filespec); @@ -1812,18 +1813,30 @@ dupnode(NODE *n) } #endif -/* force_string --- force a node to have a string value */ +/* + * force_string_fmt --- force a node to have a string value in a given format. + * The string representation of a number may change due to whether it was most + * recently rendered with CONVFMT or OFMT, or due to changes in the CONVFMT + * and OFMT values. But if the value entered gawk as a string or strnum, then + * stfmt should be set to STFMT_UNUSED, and the string representation should + * not change. + */ static inline NODE * -force_string(NODE *s) +force_string_fmt(NODE *s, const char *fmtstr, int fmtidx) { if ((s->flags & STRCUR) != 0 - && (s->stfmt == STFMT_UNUSED || s->stfmt == CONVFMTidx) + && (s->stfmt == STFMT_UNUSED || s->stfmt == fmtidx) ) return s; - return format_val(CONVFMT, CONVFMTidx, s); + return format_val(fmtstr, fmtidx, s); } +/* conceptually should be force_string_convfmt, but this is the typical case */ +#define force_string(s) force_string_fmt((s), CONVFMT, CONVFMTidx) + +#define force_string_ofmt(s) force_string_fmt((s), OFMT, OFMTidx) + #ifdef GAWKDEBUG #define unref r_unref #define force_number str2number @@ -1869,7 +1882,7 @@ fixtype(NODE *n) { assert(n->type == Node_val); if (n->type == Node_val) { - if ((n->flags & MAYBE_NUM) != 0) + if ((n->flags & (NUMCUR|MAYBE_NUM)) == MAYBE_NUM) return force_number(n); if ((n->flags & INTIND) != 0) return force_string(n); @@ -2190,11 +2190,10 @@ do_print(int nargs, int redirtype) DEREF(args_array[i]); fatal(_("attempt to use array `%s' in a scalar context"), array_vname(tmp)); } - if ( (tmp->flags & STRCUR) == 0 || ( tmp->stfmt != 
STFMT_UNUSED && tmp->stfmt != OFMTidx)) - args_array[i] = format_val(OFMT, OFMTidx, tmp); + args_array[i] = force_string_ofmt(tmp); } if (redir_exp != NULL) { @@ -3944,14 +3943,14 @@ do_typeof(int nargs) break; case Node_val: case Node_var: - switch (arg->flags & (STRING|NUMBER|MAYBE_NUM)) { + switch (fixtype(arg)->flags & (STRING|NUMBER|MAYBE_NUM)) { case STRING: res = "string"; break; case NUMBER: res = "number"; break; - case STRING|MAYBE_NUM: + case NUMBER|MAYBE_NUM: res = "strnum"; break; case NUMBER|STRING: @@ -437,7 +437,6 @@ flags2str(int flagval) { NUMCUR, "NUMCUR" }, { NUMBER, "NUMBER" }, { MAYBE_NUM, "MAYBE_NUM" }, - { FIELD, "FIELD" }, { INTLSTR, "INTLSTR" }, { NUMINT, "NUMINT" }, { INTIND, "INTIND" }, @@ -827,7 +826,6 @@ set_ORS() ORS_node->var_value = force_string(ORS_node->var_value); ORS = ORS_node->var_value->stptr; ORSlen = ORS_node->var_value->stlen; - ORS[ORSlen] = '\0'; } /* fmt_ok --- is the conversion format a valid one? */ @@ -890,7 +888,6 @@ fmt_index(NODE *n) ix++; } /* not found */ - n->stptr[n->stlen] = '\0'; if (do_lint && ! fmt_ok(n)) lintwarn(_("bad `%sFMT' specification `%s'"), n == CONVFMT_node->var_value ? "CONV" @@ -970,13 +967,10 @@ set_LINT() void set_TEXTDOMAIN() { - int len; NODE *tmp; tmp = TEXTDOMAIN_node->var_value = force_string(TEXTDOMAIN_node->var_value); TEXTDOMAIN = tmp->stptr; - len = tmp->stlen; - TEXTDOMAIN[len] = '\0'; /* * Note: don't call textdomain(); this value is for * the awk program, not for gawk itself. 
@@ -55,6 +55,7 @@ static long fpat_parse_field(long, char **, int, NODE *, static void set_element(long num, char * str, long len, NODE *arr); static void grow_fields_arr(long num); static void set_field(long num, char *str, long len, NODE *dummy); +static void purge_record(void); static char *parse_extent; /* marks where to restart parse of record */ static long parse_high_water = 0; /* field number that we have parsed so far */ @@ -93,7 +94,7 @@ init_fields() getnode(Null_field); *Null_field = *Nnull_string; Null_field->valref = 1; - Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD); + Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */ field0_valid = true; } @@ -131,7 +132,7 @@ set_field(long num, n = fields_arr[num]; n->stptr = str; n->stlen = len; - n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD); + n->flags = (STRCUR|STRING|MAYBE_NUM); /* do not set MALLOC */ } /* rebuild_record --- Someone assigned a value to $(something). @@ -194,29 +195,32 @@ rebuild_record() */ for (cops = ops, i = 1; i <= NF; i++) { NODE *r = fields_arr[i]; - if (r->stlen > 0) { + /* + * There is no reason to copy malloc'ed fields to point into + * the new $0 buffer, although that's how previous versions did + * it. It seems faster to leave the malloc'ed fields in place. + */ + if (r->stlen > 0 && (r->flags & MALLOC) == 0) { NODE *n; getnode(n); - if ((r->flags & FIELD) == 0) { - *n = *Null_field; - n->stlen = r->stlen; - if ((r->flags & (NUMCUR|NUMBER)) != 0) { - n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER)); -#ifdef HAVE_MPFR - if (is_mpg_float(r)) { - mpfr_init(n->mpg_numbr); - mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE); - } else if (is_mpg_integer(r)) { - mpz_init(n->mpg_i); - mpz_set(n->mpg_i, r->mpg_i); - } else -#endif - n->numbr = r->numbr; - } - } else { - *n = *r; - n->flags &= ~MALLOC; + *n = *r; + if (r->valref > 1) { + /* + * This probably never happens, since it + * was not considered by previous versions of + * this function. 
But it seems clear that + * we can't leave r's stptr pointing into the + * old $0 buffer that we are about to unref. + * It's not a priori obvious that valref must be + * 1 in all cases, so it seems wise to support + * this corner case. The only question is + * whether to add a warning message. + */ + emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record"); + memcpy(r->stptr, cops, r->stlen); + r->stptr[r->stlen] = '\0'; + r->flags |= MALLOC; } n->stptr = cops; @@ -227,6 +231,10 @@ rebuild_record() cops += fields_arr[i]->stlen + OFSlen; } +#ifndef NDEBUG + if ((fields_arr[0]->flags & MALLOC) == 0) + assert(fields_arr[0]->valref == 1); +#endif unref(fields_arr[0]); fields_arr[0] = tmp; @@ -252,7 +260,7 @@ set_record(const char *buf, int cnt) #define INITIAL_SIZE 512 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */ - reset_record(); + purge_record(); /* buffer management: */ if (databuf_size == 0) { /* first time */ @@ -267,8 +275,11 @@ set_record(const char *buf, int cnt) * databuf_size is > cnt after allocation. 
*/ if (cnt >= databuf_size) { - while (cnt >= databuf_size && databuf_size <= MAX_SIZE) + do { + if (databuf_size > MAX_SIZE/2) + fatal(_("input record too large")); databuf_size *= 2; + } while (cnt >= databuf_size); erealloc(databuf, char *, databuf_size, "set_record"); memset(databuf, '\0', databuf_size); } @@ -282,6 +293,10 @@ set_record(const char *buf, int cnt) databuf[cnt] = '\0'; /* manage field 0: */ +#ifndef NDEBUG + if ((fields_arr[0]->flags & MALLOC) == 0) + assert(fields_arr[0]->valref == 1); +#endif unref(fields_arr[0]); getnode(n); n->stptr = databuf; @@ -289,7 +304,7 @@ set_record(const char *buf, int cnt) n->valref = 1; n->type = Node_val; n->stfmt = STFMT_UNUSED; - n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD); + n->flags = (STRING|STRCUR|MAYBE_NUM); /* do not set MALLOC */ fields_arr[0] = n; #undef INITIAL_SIZE @@ -301,13 +316,22 @@ set_record(const char *buf, int cnt) void reset_record() { + fields_arr[0] = force_string(fields_arr[0]); + purge_record(); +} + +static void +purge_record() +{ int i; NODE *n; - fields_arr[0] = force_string(fields_arr[0]); - NF = -1; for (i = 1; i <= parse_high_water; i++) { +#ifndef NDEBUG + if ((fields_arr[i]->flags & MALLOC) == 0) + assert(fields_arr[i]->valref == 1); +#endif unref(fields_arr[i]); getnode(n); *n = *Null_field; @@ -391,6 +391,52 @@ api_awk_atexit(awk_ext_id_t id, list_head = p; } +static struct { + char **strings; + size_t i, size; +} scopy; + +void +free_api_string_copies() +{ + size_t i; + + for (i = 0; i < scopy.i; i++) + free(scopy.strings[i]); + scopy.i = 0; +} + +/* return a node string with nul termination */ + +static inline void +assign_string(NODE *node, awk_value_t *val) +{ + val->val_type = AWK_STRING; + if (node->stptr[node->stlen] != '\0') { + /* + * This is an unterminated field string, so make a copy. + * This should happen only for $n where n > 0 and n < NF. 
+ */ + char *s; + assert((node->flags & MALLOC) == 0); + if (scopy.i == scopy.size) { + /* expand list */ + if (scopy.size == 0) + scopy.size = 8; /* initial size */ + else + scopy.size *= 2; + erealloc(scopy.strings, char **, scopy.size * sizeof(char *), "assign_string"); + } + emalloc(s, char *, node->stlen + 1, "assign_string"); + memcpy(s, node->stptr, node->stlen); + s[node->stlen] = '\0'; + val->str_value.str = scopy.strings[scopy.i++] = s; + } + else + val->str_value.str = node->stptr; + val->str_value.len = node->stlen; +} + /* node_to_awk_value --- convert a node into a value for an extension */ static awk_bool_t @@ -435,11 +481,8 @@ node_to_awk_value(NODE *node, awk_value_t *val, awk_valtype_t wanted) break; case AWK_STRING: - val->val_type = AWK_STRING; - (void) force_string(node); - val->str_value.str = node->stptr; - val->str_value.len = node->stlen; + assign_string(node, val); ret = awk_true; break; @@ -465,9 +508,7 @@ node_to_awk_value(NODE *node, awk_value_t *val, awk_valtype_t wanted) val->num_value = get_number_d(node); ret = awk_true; } else if ((node->flags & STRING) != 0) { - val->val_type = AWK_STRING; - val->str_value.str = node->stptr; - val->str_value.len = node->stlen; + assign_string(node, val); ret = awk_true; } else val->val_type = AWK_UNDEFINED; @@ -279,11 +279,7 @@ enum { * be multibyte encoded in the current locale's encoding and character * set. Gawk will convert internally to wide characters if necessary. * - * Note that the string may not be terminated with a '\0' character. - * In particular, this happens for field values $n where n > 0 and n < NF, - * since the string points directly into the $0 buffer. All other strings, - * including those created by extensions, should be NUL-terminated. In general - * though, extension code should not assume that the string is NUL-terminated! + * Note that the string will always be terminated with a '\0' character. 
*/ typedef struct awk_string { char *str; /* data */ diff --git a/int_array.c b/int_array.c index 2ab68eeb..0014a81f 100644 --- a/int_array.c +++ b/int_array.c @@ -184,7 +184,8 @@ is_integer(NODE *symbol, NODE *subs) if (len == 1 && *cp != '-') { /* single digit */ subs->numbr = (long) (*cp - '0'); if ((subs->flags & MAYBE_NUM) != 0) { - subs->flags &= ~(MAYBE_NUM|STRING); + /* leave MAYBE_NUM set */ + subs->flags &= ~STRING; subs->flags |= NUMBER; } subs->flags |= (NUMCUR|NUMINT); @@ -203,7 +204,8 @@ is_integer(NODE *symbol, NODE *subs) subs->numbr = l; if ((subs->flags & MAYBE_NUM) != 0) { - subs->flags &= ~(MAYBE_NUM|STRING); + /* leave MAYBE_NUM set */ + subs->flags &= ~STRING; subs->flags |= NUMBER; } subs->flags |= NUMCUR; diff --git a/interpret.h b/interpret.h index bba3cbb5..56d2e060 100644 --- a/interpret.h +++ b/interpret.h @@ -23,13 +23,19 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ +/* + * If "r" is a field, valref should normally be > 1, because the field is + * created initially with valref 1, and valref should be bumped when it is + * pushed onto the stack by Op_field_spec. On the other hand, if we are + * assigning to $n, then Op_store_field calls unref(*lhs) before assigning + * the new value, so that decrements valref. So if the RHS is a field with + * valref 1, that effectively means that this is an assignment like "$n = $n", + * so a no-op, other than triggering $0 reconstitution. 
+ */ #define UNFIELD(l, r) \ { \ /* if was a field, turn it into a var */ \ - if ((r->flags & FIELD) == 0) { \ - l = r; \ - } else if (r->valref == 1) { \ - r->flags &= ~FIELD; \ + if ((r->flags & MALLOC) != 0 || r->valref == 1) { \ l = r; \ } else { \ l = dupnode(r); \ @@ -357,12 +363,8 @@ uninitialized_scalar: lhs = r_get_field(t1, (Func_ptr *) 0, true); decr_sp(); DEREF(t1); - /* only for $0, up ref count */ - if (*lhs == fields_arr[0]) { - r = *lhs; - UPREF(r); - } else - r = dupnode(*lhs); + r = *lhs; + UPREF(r); PUSH(r); break; @@ -964,6 +966,7 @@ arrayfor: if (t1->type == Node_val) DEREF(t1); } + free_api_string_copies(); PUSH(r); } break; @@ -480,7 +480,6 @@ nextfile(IOBUF **curfile, bool skipping) if (arg == NULL || arg->stlen == 0) continue; arg = force_string(arg); - arg->stptr[arg->stlen] = '\0'; if (! do_traditional) { unref(ARGIND_node->var_value); ARGIND_node->var_value = make_number((AWKNUM) i); @@ -349,7 +349,8 @@ mpg_force_number(NODE *n) if (force_mpnum(n, (do_non_decimal_data && ! do_traditional), true)) { if ((n->flags & MAYBE_NUM) != 0) { - n->flags &= ~(MAYBE_NUM|STRING); + /* leave MAYBE_NUM set to indicate a strnum */ + n->flags &= ~STRING; n->flags |= NUMBER; } } else @@ -67,9 +67,9 @@ r_force_number(NODE *n) return n; /* - * We should always set NUMCUR and clear MAYBE_NUM, and we may possibly - * change STRING to NUMBER if MAYBE_NUM was set and it's a good numeric - * string. + * We should always set NUMCUR. If MAYBE_NUM is set and it's a + * numeric string, we clear STRING and enable NUMBER, but if it's not + * numeric, we disable MAYBE_NUM. 
*/ /* All the conditionals are an attempt to avoid the expensive strtod */ @@ -164,7 +164,8 @@ badnum: goodnum: if ((n->flags & MAYBE_NUM) != 0) { - n->flags &= ~(MAYBE_NUM|STRING); + /* leave MAYBE_NUM enabled to indicate that this is a strnum */ + n->flags &= ~STRING; n->flags |= NUMBER; } return n; @@ -300,7 +301,6 @@ r_dupnode(NODE *n) getnode(r); *r = *n; - r->flags &= ~FIELD; r->flags |= MALLOC; r->valref = 1; /* diff --git a/str_array.c b/str_array.c index 65e0b741..d832380d 100644 --- a/str_array.c +++ b/str_array.c @@ -164,11 +164,9 @@ str_lookup(NODE *symbol, NODE *subs) * "Array indices are always strings." * "Array indices are always strings." * .... - * If subs is a STRNUM, copy it; don't clear the MAYBE_NUM - * flag on it since other variables could be using the same - * reference-counted value. */ - if (subs->stfmt != STFMT_UNUSED || (subs->flags & MAYBE_NUM) != 0) { + if (subs->stfmt != STFMT_UNUSED) { + /* The string was generated using CONVFMT. */ NODE *tmp; /* @@ -199,8 +197,6 @@ str_lookup(NODE *symbol, NODE *subs) subs = dupnode(subs); } - assert((subs->flags & MAYBE_NUM) == 0); - getbucket(b); b->ahnext = symbol->buckets[hash1]; symbol->buckets[hash1] = b; diff --git a/test/ChangeLog b/test/ChangeLog index 10809205..06e77da9 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -82,6 +82,26 @@ the report. * clos1way6.ok2: New file. +2016-07-08 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * Makefile.am (apiterm, fldterm): New tests to make sure that we + are handling unterminated field string values properly. + * apiterm.awk, apiterm.in, apiterm.ok: New files. + * fldterm.awk, fldterm.in, fldterm.ok: New files. + +2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * forcenum.awk: We no longer need to force the strnum conversion, + since typeof now does this automatically. + * forcenum.ok: Change "number" to "strnum" for the numeric strings. + * rebuild.in: Change input to include a strnum. 
+ * rebuild.ok: Update results. + +2016-07-04 Andrew J. Schorr <aschorr@telemetry-investments.com> + + * Makefile.am (arrayind3): New test. + * arrayind3.awk, arrayind3.ok: New files. + 2016-07-03 Andrew J. Schorr <aschorr@telemetry-investments.com> * Makefile.am (rebuild): New test. diff --git a/test/Makefile.am b/test/Makefile.am index 9dbedb35..b0bfc128 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -50,6 +50,9 @@ EXTRA_DIST = \ anchor.awk \ anchor.in \ anchor.ok \ + apiterm.awk \ + apiterm.in \ + apiterm.ok \ argarray.awk \ argarray.in \ argarray.ok \ @@ -60,6 +63,8 @@ EXTRA_DIST = \ arrayind1.ok \ arrayind2.awk \ arrayind2.ok \ + arrayind3.awk \ + arrayind3.ok \ arrayparm.awk \ arrayparm.ok \ arrayprm2.awk \ @@ -276,6 +281,9 @@ EXTRA_DIST = \ fldchgnf.awk \ fldchgnf.in \ fldchgnf.ok \ + fldterm.awk \ + fldterm.in \ + fldterm.ok \ fmtspcl-mpfr.ok \ fmtspcl.awk \ fmtspcl.tok \ @@ -1150,7 +1158,7 @@ CLEANFILES = core core.* fmtspcl.ok # try to keep these sorted. each letter starts a new line BASIC_TESTS = \ - addcomma anchgsub anchor argarray arrayind1 arrayind2 arrayparm arrayprm2 arrayprm3 \ + addcomma anchgsub anchor argarray arrayind1 arrayind2 arrayind3 arrayparm arrayprm2 arrayprm3 \ arrayref arrymem1 arryref2 arryref3 arryref4 arryref5 arynasty \ arynocls aryprm1 aryprm2 aryprm3 aryprm4 aryprm5 aryprm6 aryprm7 \ aryprm8 aryprm9 arysubnm asgext awkpath \ @@ -1159,7 +1167,7 @@ BASIC_TESTS = \ concat3 concat4 convfmt \ datanonl defref delargv delarpm2 delarprm delfunc dfamb1 dfastress dynlj \ eofsplit exit2 exitval1 exitval2 exitval3 \ - fcall_exit fcall_exit2 fldchg fldchgnf fnamedat fnarray fnarray2 \ + fcall_exit fcall_exit2 fldchg fldchgnf fldterm fnamedat fnarray fnarray2 \ fnaryscl fnasgnm fnmisc fordel forref forsimp fsbs fsnul1 fsrs fsspcoln \ fstabplus funsemnl funsmnam funstack \ getline getline2 getline3 getline4 getline5 getlnbuf getnr2tb getnr2tm \ @@ -1245,7 +1253,7 @@ LOCALE_CHARSET_TESTS = \ mbprintf1 mbprintf2 mbprintf3 mbprintf4 
rebt8b2 rtlenmb sort1 sprintfc SHLIB_TESTS = \ - fnmatch filefuncs fork fork2 fts functab4 getfile inplace1 inplace2 inplace3 \ + apiterm fnmatch filefuncs fork fork2 fts functab4 getfile inplace1 inplace2 inplace3 \ ordchr ordchr2 readdir readfile readfile2 revout revtwoway rwarray testext time # List of the tests which should be run with --lint option: diff --git a/test/Makefile.in b/test/Makefile.in index f1ffc0db..1c4514bb 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -307,6 +307,9 @@ EXTRA_DIST = \ anchor.awk \ anchor.in \ anchor.ok \ + apiterm.awk \ + apiterm.in \ + apiterm.ok \ argarray.awk \ argarray.in \ argarray.ok \ @@ -317,6 +320,8 @@ EXTRA_DIST = \ arrayind1.ok \ arrayind2.awk \ arrayind2.ok \ + arrayind3.awk \ + arrayind3.ok \ arrayparm.awk \ arrayparm.ok \ arrayprm2.awk \ @@ -533,6 +538,9 @@ EXTRA_DIST = \ fldchgnf.awk \ fldchgnf.in \ fldchgnf.ok \ + fldterm.awk \ + fldterm.in \ + fldterm.ok \ fmtspcl-mpfr.ok \ fmtspcl.awk \ fmtspcl.tok \ @@ -1406,7 +1414,7 @@ CLEANFILES = core core.* fmtspcl.ok # try to keep these sorted. 
each letter starts a new line BASIC_TESTS = \ - addcomma anchgsub anchor argarray arrayind1 arrayind2 arrayparm arrayprm2 arrayprm3 \ + addcomma anchgsub anchor argarray arrayind1 arrayind2 arrayind3 arrayparm arrayprm2 arrayprm3 \ arrayref arrymem1 arryref2 arryref3 arryref4 arryref5 arynasty \ arynocls aryprm1 aryprm2 aryprm3 aryprm4 aryprm5 aryprm6 aryprm7 \ aryprm8 aryprm9 arysubnm asgext awkpath \ @@ -1415,7 +1423,7 @@ BASIC_TESTS = \ concat3 concat4 convfmt \ datanonl defref delargv delarpm2 delarprm delfunc dfamb1 dfastress dynlj \ eofsplit exit2 exitval1 exitval2 exitval3 \ - fcall_exit fcall_exit2 fldchg fldchgnf fnamedat fnarray fnarray2 \ + fcall_exit fcall_exit2 fldchg fldchgnf fldterm fnamedat fnarray fnarray2 \ fnaryscl fnasgnm fnmisc fordel forref forsimp fsbs fsnul1 fsrs fsspcoln \ fstabplus funsemnl funsmnam funstack \ getline getline2 getline3 getline4 getline5 getlnbuf getnr2tb getnr2tm \ @@ -1497,7 +1505,7 @@ LOCALE_CHARSET_TESTS = \ mbprintf1 mbprintf2 mbprintf3 mbprintf4 rebt8b2 rtlenmb sort1 sprintfc SHLIB_TESTS = \ - fnmatch filefuncs fork fork2 fts functab4 getfile inplace1 inplace2 inplace3 \ + apiterm fnmatch filefuncs fork fork2 fts functab4 getfile inplace1 inplace2 inplace3 \ ordchr ordchr2 readdir readfile readfile2 revout revtwoway rwarray testext time @@ -2818,6 +2826,11 @@ arrayind2: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +arrayind3: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + arrayparm: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @@ -3068,6 +3081,11 @@ fldchgnf: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +fldterm: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? 
>>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fnamedat: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @@ -4364,6 +4382,11 @@ sprintfc: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +apiterm: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fnmatch: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ diff --git a/test/Maketests b/test/Maketests index a13ed244..ebdf6901 100644 --- a/test/Maketests +++ b/test/Maketests @@ -25,6 +25,11 @@ arrayind2: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +arrayind3: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + arrayparm: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @@ -275,6 +280,11 @@ fldchgnf: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +fldterm: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fnamedat: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @@ -1571,6 +1581,11 @@ sprintfc: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +apiterm: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? 
>>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fnmatch: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ diff --git a/test/apiterm.awk b/test/apiterm.awk new file mode 100644 index 00000000..95e4b120 --- /dev/null +++ b/test/apiterm.awk @@ -0,0 +1,8 @@ +@load "filefuncs" + +{ + print $1 + # check whether API terminates field strings properly + print chdir($1) + print ERRNO +} diff --git a/test/apiterm.in b/test/apiterm.in new file mode 100644 index 00000000..c4732514 --- /dev/null +++ b/test/apiterm.in @@ -0,0 +1 @@ +. fubar diff --git a/test/apiterm.ok b/test/apiterm.ok new file mode 100644 index 00000000..ef4043be --- /dev/null +++ b/test/apiterm.ok @@ -0,0 +1,3 @@ +. +0 + diff --git a/test/arrayind3.awk b/test/arrayind3.awk new file mode 100644 index 00000000..ca4c58b5 --- /dev/null +++ b/test/arrayind3.awk @@ -0,0 +1,19 @@ +BEGIN { + # initialize cint arrays + pos[0] = 0 + posout[0] = 0 + split("00000779770060", f) # f[1] is a strnum + pos[f[1]] = 1 # subscripts must be strings! + for (x in pos) { + # if x is a strnum, then the + # x != 0 test may convert it to an integral NUMBER, + # and we might lose the unusual string representation + # if the cint code is not careful to recognize that this is + # actually a string + if (x != 0) + posout[x] = pos[x] + } + # which array element is populated? 
+ print posout[779770060] + print posout["00000779770060"] +} diff --git a/test/arrayind3.ok b/test/arrayind3.ok new file mode 100644 index 00000000..a464d9da --- /dev/null +++ b/test/arrayind3.ok @@ -0,0 +1,2 @@ + +1 diff --git a/test/fldterm.awk b/test/fldterm.awk new file mode 100644 index 00000000..26fe01fb --- /dev/null +++ b/test/fldterm.awk @@ -0,0 +1,10 @@ +BEGIN { + # choose a field separator that is numeric, so we can test whether + # force_number properly handles unterminated numeric field strings + FS = "3" +} + +{ + print $1+0 + print $1 +} diff --git a/test/fldterm.in b/test/fldterm.in new file mode 100644 index 00000000..14a41cae --- /dev/null +++ b/test/fldterm.in @@ -0,0 +1 @@ +5.53apple diff --git a/test/fldterm.ok b/test/fldterm.ok new file mode 100644 index 00000000..ecd7600e --- /dev/null +++ b/test/fldterm.ok @@ -0,0 +1,2 @@ +5.5 +5.5 diff --git a/test/forcenum.awk b/test/forcenum.awk index 54c536c9..1a7ddce7 100644 --- a/test/forcenum.awk +++ b/test/forcenum.awk @@ -1,8 +1,6 @@ BEGIN { - # first, make some strnums + # make some strnums nf = split("|5apple|+NaN| 6|0x1az|011Q|027", f, "|") - for (i = 1; i <= nf; i++) { - x = f[i]+0 # trigger strnum conversion to number or string + for (i = 1; i <= nf; i++) printf "[%s] -> %g (type %s)\n", f[i], f[i], typeof(f[i]) - } } diff --git a/test/forcenum.ok b/test/forcenum.ok index c74eefc7..a379db62 100644 --- a/test/forcenum.ok +++ b/test/forcenum.ok @@ -1,7 +1,7 @@ [] -> 0 (type string) [5apple] -> 5 (type string) -[+NaN] -> nan (type number) -[ 6] -> 6 (type number) +[+NaN] -> nan (type strnum) +[ 6] -> 6 (type strnum) [0x1az] -> 26 (type string) [011Q] -> 9 (type string) -[027] -> 23 (type number) +[027] -> 23 (type strnum) diff --git a/test/rebuild.in b/test/rebuild.in index b2901ea9..2f16a825 100644 --- a/test/rebuild.in +++ b/test/rebuild.in @@ -1 +1 @@ -a b +a 6.3 diff --git a/test/rebuild.ok b/test/rebuild.ok index 29635279..0fe72e23 100644 --- a/test/rebuild.ok +++ b/test/rebuild.ok @@ -1,2 
+1,2 @@ -test b +test 6.3 strnum |