aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog133
-rw-r--r--array.c2
-rw-r--r--awk.h69
-rw-r--r--builtin.c27
-rw-r--r--eval.c15
-rw-r--r--field.c203
-rw-r--r--gawkapi.c2
-rw-r--r--gawkapi.h6
-rw-r--r--int_array.c10
-rw-r--r--interpret.h27
-rw-r--r--io.c1
-rw-r--r--mpfr.c8
-rw-r--r--node.c29
-rw-r--r--profile.c1
-rw-r--r--str_array.c8
-rw-r--r--test/ChangeLog13
-rw-r--r--test/Makefile.am4
-rw-r--r--test/Makefile.in9
-rw-r--r--test/Maketests5
-rw-r--r--test/arrayind3.awk19
-rw-r--r--test/arrayind3.ok2
-rw-r--r--test/forcenum.awk6
-rw-r--r--test/forcenum.ok6
-rw-r--r--test/rebuild.in2
-rw-r--r--test/rebuild.ok2
25 files changed, 423 insertions, 186 deletions
diff --git a/ChangeLog b/ChangeLog
index ecd9305a..7d11fd5b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,136 @@
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * awk.h: Modify comments to indicate that MAYBE_NUM will now be
+ left enabled to indicate strnum values by the NUMBER|MAYBE_NUM
+ combination, whereas STRING|MAYBE_NUM indicates a potential strnum.
+ (fixtype): Modify MAYBE_NUM test to avoid calling force_number if
+ NUMCUR is already set.
+ * builtin.c (do_typeof): Call fixtype to resolve argument type.
+ This forces parsing of numeric strings, so there's a performance
+ penalty, but we must do this to give a correct result. The meaning
+ of "strnum" changes from "potential strnum" to "actual strnum".
+ * eval.c (set_TEXTDOMAIN): Remove some dead code left over from last
+ patch.
+ * int_array.c (is_integer): When a MAYBE_NUM is converted successfully
+ to a NUMBER, leave the MAYBE_NUM flag enabled.
+ * mpfr.c (mpg_force_number): Ditto.
+ * node.c (r_force_number): Ditto.
+
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * awk.h: Modify stptr comment to indicate that all strings are now
+ NUL-terminated.
+ * builtin.c (do_mktime): Remove unnecessary logic to terminate
+ the string with '\0' temporarily.
+ (do_system): Ditto.
+ (nondec2awknum): Add a comment about termination.
+ * eval.c (posix_compare): Remove logic to terminate strings temporarily.
+ (set_ORS): No need to terminate ORS, since the string node is already
+ terminated. What gave us the right to modify that node anyway?
+ (fmt_index): Remove code to terminate string. This seems to have been
+ invalid anyway, since we don't own that memory.
+ (set_TEXTDOMAIN): Do not terminate TEXTDOMAIN string, since the node
+ is already terminated. We didn't have the right to modify that node
+ anyway.
+ * gawkapi.c (node_to_awk_value): Add assert checks to confirm that the
+ string is NUL-terminated.
+ * gawkapi.h: Modify awk_string comment to indicate that strings are
+ always terminated with '\0'.
+ * int_array.c (is_integer): Remove unnecessary logic to terminate string
+ with '\0' temporarily.
+ * interpret.h (Op_push_i): Ditto.
+ * io.c (nextfile): Remove string termination. We didn't own that memory
+ anyway.
+ * mpfr.c (force_mpnum): Remove unnecessary logic to terminate the
+ string with '\0' temporarily.
+ * node.c (r_force_number): Remove NUL termination around strtod call,
+ since we already know that there is either a white space or '\0'
+ character there. Either one will stop strtod.
+ (get_ieee_magic_val): Ditto.
+ * profile.c (pp_number): No need to terminate string returned by
+ r_format_val.
+
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * interpret.h (Op_field_spec): Now that all $n field values are
+ NUL-terminated, there is no reason to call dupnode for $n where n > 0.
+ This saves malloc and copying overhead, thereby more than offsetting the
+ performance hit of the additional copying and NUL-termination in the
+ last patch to field.c. It also eliminates repeated parsing in cases
+ where $n, for n > 0, was accessed more than once in a numeric context,
+ so the new approach should be a performance win.
+
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ Make sure that all field values, and therefore all strings inside gawk,
+ are terminated with a '\0' character!
+ * field.c (databuf): New static struct to hold info about our buffer to
+ contain the field string values.
+ (allocate_databuf): New function to make sure the databuf is large
+ enough to hold $0 and copies of $1 through $NF.
+ (set_field): Copy $n into free space previously allocated in databuf
+ and add a '\0' at the end.
+ (rebuild_record): Call allocate_databuf to ensure sufficient space
+ for copying non-malloced field values. When copying field values,
+ use databuf to create a NUL-terminated copy.
+ (purge_record): New function extracted from reset_record to initialize
+ $1 through $NF to null values.
+ (set_record): Buffer management moved to new allocate_databuf function.
+ Call purge_record instead of reset_record, since reset_record contains
+ some extra logic not needed in this case.
+ (reset_record): Call purge_record to do most of the work, and call
+ allocate_databuf to make sure we have a big enough buffer to contain
+ copies of the $1 through $NF.
+
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * awk.h: Renumber flags to remove gap created when FIELD was removed.
+
+2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * field.c (rebuild_record): Need to set MALLOC flag if we allocate
+ memory for a residual field node with valref > 1.
+
+2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * field.c (rebuild_record): Do not bother to create new field nodes
+ to replace malloc'ed nodes when rebuilding $0.
+
+2016-07-05 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * awk.h (FIELD): Remove unnecessary flag.
+ (MALLOC): Move definition to join the others, and improve the comment.
+ * array.c (value_info): Replace FIELD test with MALLOC test.
+ * eval.c (flags2str): Remove FIELD flag.
+ * field.c (init_fields): Remove FIELD bit from Null_field->flags.
+ (set_field): Remove FIELD bit from flags.
+ (rebuild_record): Test against MALLOC instead of FIELD. If a field
+ node has valref > 1, we should make a copy, although I don't think
+ it is valid for this to happen.
+ (set_record): Remove FIELD bit from flags.
+ * interpret.h (UNFIELD): Add comment, and test MALLOC flag instead of
+ FIELD. Remove probably buggy code to disable the FIELD flag when
+ valref is 1; that would have created a node where neither the FIELD
+ nor MALLOC flag was set, which seems invalid.
+ * node.c (r_dupnode): Remove code disabling FIELD flag.
+
+2016-07-04 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * awk.h (force_string_fmt): New inline function to get the string
+ representation in a requested format.
+ (force_string): Reimplement as a macro using force_string_fmt function.
+ (force_string_ofmt): New macro to get a value's OFMT representation.
+ * builtin.c (do_print): Use new force_string_ofmt macro instead of
+ duplicating the logic inline.
+
+2016-07-04 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * str_array.c (str_lookup): There is no need to worry about the
+ MAYBE_NUM flag, since the code has been patched to make sure to
+ preserve the string value of strnum values, and the integer array
+ code should no longer mistakenly claim a strnum integer with a
+ nonstandard string representation.
+
2016-07-03 Andrew J. Schorr <aschorr@telemetry-investments.com>
* field.c (rebuild_record): Revert warning message regarding flags,
diff --git a/array.c b/array.c
index d9c80a35..be5adfff 100644
--- a/array.c
+++ b/array.c
@@ -696,7 +696,7 @@ value_info(NODE *n)
fprintf(output_fp, ":%s", flags2str(n->flags));
- if ((n->flags & FIELD) == 0)
+ if ((n->flags & MALLOC) != 0)
fprintf(output_fp, ":%ld", n->valref);
else
fprintf(output_fp, ":");
diff --git a/awk.h b/awk.h
index 34755581..9e1fae10 100644
--- a/awk.h
+++ b/awk.h
@@ -387,9 +387,6 @@ typedef struct exp_node {
NODETYPE type;
unsigned int flags;
-/* any type */
-# define MALLOC 0x0001 /* can be free'd */
-
/* type = Node_val */
/*
* STRING and NUMBER are mutually exclusive, except for the special
@@ -409,14 +406,16 @@ typedef struct exp_node {
* b = a + 0 # Adds NUMCUR to a, since numeric value
* # is now available. But the type hasn't changed!
*
- * MAYBE_NUM is the joker. It means "this is string data, but
- * the user may have really wanted it to be a number. If we have
- * to guess, like in a comparison, turn it into a number if the string
- * is indeed numeric."
+ * MAYBE_NUM is the joker. When STRING|MAYBE_NUM is set, it means
+ * "this is string data, but the user may have really wanted it to be a
+ * number. If we have to guess, like in a comparison, turn it into a
+ * number if the string is indeed numeric."
* For example, gawk -v a=42 ....
* Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where
* a number is needed, it gets turned into a NUMBER and STRING
- * is cleared.
+ * is cleared. In that case, we leave the MAYBE_NUM in place, so
+ * the combination NUMBER|MAYBE_NUM means it is a strnum a.k.a. a
+ * "numeric string".
*
* WSTRCUR is for efficiency. If in a multibyte locale, and we
* need to do something character based (substr, length, etc.)
@@ -426,29 +425,29 @@ typedef struct exp_node {
*
* We hope that the rest of the flags are self-explanatory. :-)
*/
+# define MALLOC 0x0001 /* stptr can be free'd, i.e. not a field node pointing into a shared buffer */
# define STRING 0x0002 /* assigned as string */
# define STRCUR 0x0004 /* string value is current */
# define NUMCUR 0x0008 /* numeric value is current */
# define NUMBER 0x0010 /* assigned as number */
# define MAYBE_NUM 0x0020 /* user input: if NUMERIC then
* a NUMBER */
-# define FIELD 0x0040 /* this is a field */
-# define INTLSTR 0x0080 /* use localized version */
-# define NUMINT 0x0100 /* numeric value is an integer */
-# define INTIND 0x0200 /* integral value is array index;
+# define INTLSTR 0x0040 /* use localized version */
+# define NUMINT 0x0080 /* numeric value is an integer */
+# define INTIND 0x0100 /* integral value is array index;
* lazy conversion to string.
*/
-# define WSTRCUR 0x0400 /* wide str value is current */
-# define MPFN 0x0800 /* arbitrary-precision floating-point number */
-# define MPZN 0x1000 /* arbitrary-precision integer */
-# define NO_EXT_SET 0x2000 /* extension cannot set a value for this variable */
-# define NULL_FIELD 0x4000 /* this is the null field */
+# define WSTRCUR 0x0200 /* wide str value is current */
+# define MPFN 0x0400 /* arbitrary-precision floating-point number */
+# define MPZN 0x0800 /* arbitrary-precision integer */
+# define NO_EXT_SET 0x1000 /* extension cannot set a value for this variable */
+# define NULL_FIELD 0x2000 /* this is the null field */
/* type = Node_var_array */
-# define ARRAYMAXED 0x8000 /* array is at max size */
-# define HALFHAT 0x10000 /* half-capacity Hashed Array Tree;
+# define ARRAYMAXED 0x4000 /* array is at max size */
+# define HALFHAT 0x8000 /* half-capacity Hashed Array Tree;
* See cint_array.c */
-# define XARRAY 0x20000
+# define XARRAY 0x10000
} NODE;
#define vname sub.nodep.name
@@ -476,13 +475,7 @@ typedef struct exp_node {
#define re_cnt flags
/* Node_val */
-/*
- * Note that the string in stptr may not be NUL-terminated, but it is
- * guaranteed to have at least one extra byte that may be temporarily set
- * to '\0'. This is helpful when calling functions such as strtod that require
- * a NUL-terminated argument. In particular, field values $n for n > 0 and
- * n < NF will not have a NUL terminator, since they point into the $0 buffer.
- */
+/* Note that the string in stptr will always be NUL-terminated. */
#define stptr sub.val.sp
#define stlen sub.val.slen
#define valref sub.val.sref
@@ -1799,21 +1792,33 @@ dupnode(NODE *n)
}
#endif
-/* force_string --- force a node to have a string value */
+/*
+ * force_string_fmt --- force a node to have a string value in a given format.
+ * The string representation of a number may change due to whether it was most
+ * recently rendered with CONVFMT or OFMT, or due to changes in the CONVFMT
+ * and OFMT values. But if the value entered gawk as a string or strnum, then
+ * stfmt should be set to STFMT_UNUSED, and the string representation should
+ * not change.
+ */
static inline NODE *
-force_string(NODE *s)
+force_string_fmt(NODE *s, const char *fmtstr, int fmtidx)
{
if (s->type == Node_typedregex)
return dupnode(s->re_exp);
if ((s->flags & STRCUR) != 0
- && (s->stfmt == STFMT_UNUSED || s->stfmt == CONVFMTidx)
+ && (s->stfmt == STFMT_UNUSED || s->stfmt == fmtidx)
)
return s;
- return format_val(CONVFMT, CONVFMTidx, s);
+ return format_val(fmtstr, fmtidx, s);
}
+/* conceptually should be force_string_convfmt, but this is the typical case */
+#define force_string(s) force_string_fmt((s), CONVFMT, CONVFMTidx)
+
+#define force_string_ofmt(s) force_string_fmt((s), OFMT, OFMTidx)
+
#ifdef GAWKDEBUG
#define unref r_unref
#define force_number str2number
@@ -1862,7 +1867,7 @@ fixtype(NODE *n)
{
assert(n->type == Node_val || n->type == Node_typedregex);
if (n->type == Node_val) {
- if ((n->flags & MAYBE_NUM) != 0)
+ if ((n->flags & (NUMCUR|MAYBE_NUM)) == MAYBE_NUM)
return force_number(n);
if ((n->flags & INTIND) != 0)
return force_string(n);
diff --git a/builtin.c b/builtin.c
index 92ac9e49..032f0ec7 100644
--- a/builtin.c
+++ b/builtin.c
@@ -2035,16 +2035,12 @@ do_mktime(int nargs)
int month, day, hour, minute, second, count;
int dst = -1; /* default is unknown */
time_t then_stamp;
- char save;
t1 = POP_SCALAR();
if (do_lint && (fixtype(t1)->flags & STRING) == 0)
lintwarn(_("mktime: received non-string argument"));
t1 = force_string(t1);
- save = t1->stptr[t1->stlen];
- t1->stptr[t1->stlen] = '\0';
-
count = sscanf(t1->stptr, "%ld %d %d %d %d %d %d",
& year, & month, & day,
& hour, & minute, & second,
@@ -2058,7 +2054,6 @@ do_mktime(int nargs)
|| (month < 1 || month > 12) ))
lintwarn(_("mktime: at least one of the values is out of the default range"));
- t1->stptr[t1->stlen] = save;
DEREF(t1);
if (count < 6
@@ -2088,7 +2083,6 @@ do_system(int nargs)
NODE *tmp;
AWKNUM ret = 0; /* floating point on purpose, compat Unix awk */
char *cmd;
- char save;
int status;
if (do_sandbox)
@@ -2101,10 +2095,6 @@ do_system(int nargs)
cmd = force_string(tmp)->stptr;
if (cmd && *cmd) {
- /* insure arg to system is zero-terminated */
- save = cmd[tmp->stlen];
- cmd[tmp->stlen] = '\0';
-
os_restore_mode(fileno(stdin));
#ifdef SIGPIPE
signal(SIGPIPE, SIG_DFL);
@@ -2148,7 +2138,6 @@ do_system(int nargs)
signal(SIGPIPE, SIG_IGN);
#endif
- cmd[tmp->stlen] = save;
}
DEREF(tmp);
return make_number((AWKNUM) ret);
@@ -2200,12 +2189,7 @@ do_print(int nargs, int redirtype)
DEREF(args_array[i]);
fatal(_("attempt to use array `%s' in a scalar context"), array_vname(tmp));
}
-
- if (tmp->type == Node_typedregex)
- args_array[i] = force_string(tmp);
- else if (!((tmp->flags & STRCUR) != 0
- && (tmp->stfmt == STFMT_UNUSED || tmp->stfmt == OFMTidx)))
- args_array[i] = format_val(OFMT, OFMTidx, tmp);
+ args_array[i] = force_string_ofmt(tmp);
}
if (redir_exp != NULL) {
@@ -3637,6 +3621,11 @@ nondec2awknum(char *str, size_t len, char **endptr)
*endptr = str;
} else {
decimal:
+ /*
+ * Terminating is probably unnecessary, since the caller always
+ * passes a string ending with '\0' or white space, but it
+ * seems safest to leave this to avoid future problems.
+ */
save = str[len];
str[len] = '\0';
retval = strtod(str, endptr);
@@ -3962,14 +3951,14 @@ do_typeof(int nargs)
break;
case Node_val:
case Node_var:
- switch (arg->flags & (STRING|NUMBER|MAYBE_NUM)) {
+ switch (fixtype(arg)->flags & (STRING|NUMBER|MAYBE_NUM)) {
case STRING:
res = "string";
break;
case NUMBER:
res = "number";
break;
- case STRING|MAYBE_NUM:
+ case NUMBER|MAYBE_NUM:
res = "strnum";
break;
case NUMBER|STRING:
diff --git a/eval.c b/eval.c
index 5b4418b8..cfb1d1e6 100644
--- a/eval.c
+++ b/eval.c
@@ -435,7 +435,6 @@ flags2str(int flagval)
{ NUMCUR, "NUMCUR" },
{ NUMBER, "NUMBER" },
{ MAYBE_NUM, "MAYBE_NUM" },
- { FIELD, "FIELD" },
{ INTLSTR, "INTLSTR" },
{ NUMINT, "NUMINT" },
{ INTIND, "INTIND" },
@@ -494,15 +493,8 @@ static int
posix_compare(NODE *s1, NODE *s2)
{
int ret = 0;
- char save1, save2;
size_t l = 0;
- save1 = s1->stptr[s1->stlen];
- s1->stptr[s1->stlen] = '\0';
-
- save2 = s2->stptr[s2->stlen];
- s2->stptr[s2->stlen] = '\0';
-
if (gawk_mb_cur_max == 1) {
if (strlen(s1->stptr) == s1->stlen && strlen(s2->stptr) == s2->stlen)
ret = strcoll(s1->stptr, s2->stptr);
@@ -564,8 +556,6 @@ posix_compare(NODE *s1, NODE *s2)
}
#endif
- s1->stptr[s1->stlen] = save1;
- s2->stptr[s2->stlen] = save2;
return ret;
}
@@ -825,7 +815,6 @@ set_ORS()
ORS_node->var_value = force_string(ORS_node->var_value);
ORS = ORS_node->var_value->stptr;
ORSlen = ORS_node->var_value->stlen;
- ORS[ORSlen] = '\0';
}
/* fmt_ok --- is the conversion format a valid one? */
@@ -888,7 +877,6 @@ fmt_index(NODE *n)
ix++;
}
/* not found */
- n->stptr[n->stlen] = '\0';
if (do_lint && ! fmt_ok(n))
lintwarn(_("bad `%sFMT' specification `%s'"),
n == CONVFMT_node->var_value ? "CONV"
@@ -967,13 +955,10 @@ set_LINT()
void
set_TEXTDOMAIN()
{
- int len;
NODE *tmp;
tmp = TEXTDOMAIN_node->var_value = force_string(TEXTDOMAIN_node->var_value);
TEXTDOMAIN = tmp->stptr;
- len = tmp->stlen;
- TEXTDOMAIN[len] = '\0';
/*
* Note: don't call textdomain(); this value is for
* the awk program, not for gawk itself.
diff --git a/field.c b/field.c
index 892818f7..3307aced 100644
--- a/field.c
+++ b/field.c
@@ -55,6 +55,8 @@ static long fpat_parse_field(long, char **, int, NODE *,
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
+static void purge_record(void);
+static void allocate_databuf(size_t, bool);
static char *parse_extent; /* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
@@ -93,7 +95,7 @@ init_fields()
getnode(Null_field);
*Null_field = *Nnull_string;
Null_field->valref = 1;
- Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
+ Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
field0_valid = true;
}
@@ -115,6 +117,15 @@ grow_fields_arr(long num)
nf_high_water = num;
}
+static struct {
+ char *p; /* buffer for $0 and field copies */
+ size_t size; /* buffer size */
+ char *space; /*
+ * Pointer to free space in databuf.p for making
+ * NUL-terminated copies of $1 thru $NF
+ */
+} databuf;
+
/* set_field --- set the value of a particular field */
/*ARGSUSED*/
@@ -129,9 +140,20 @@ set_field(long num,
if (num > nf_high_water)
grow_fields_arr(num);
n = fields_arr[num];
- n->stptr = str;
+ /*
+ * Make a NUL-terminated copy. It is tempting to do this only if
+ * str[len] != '\0', but the parse methods cannot be relied upon to
+ * avoid altering the contents of the record during parsing. For
+ * example, def_parse_field changes the final NUL to a space. In
+ * principle, the method could change other characters, so it does
+ * not seem safe to rely upon the value of str[len].
+ */
+ memcpy(databuf.space, str, len);
+ databuf.space[len] = '\0';
+ n->stptr = databuf.space;
+ databuf.space += len+1;
n->stlen = len;
- n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
+ n->flags = (STRCUR|STRING|MAYBE_NUM); /* do not set MALLOC */
}
/* rebuild_record --- Someone assigned a value to $(something).
@@ -185,6 +207,8 @@ rebuild_record()
}
}
tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
+ allocate_databuf(tlen, false);
+ databuf.space = databuf.p;
/*
* Since we are about to unref fields_arr[0], we want to find
@@ -194,32 +218,44 @@ rebuild_record()
*/
for (cops = ops, i = 1; i <= NF; i++) {
NODE *r = fields_arr[i];
- if (r->stlen > 0) {
+ /*
+ * There is no reason to copy malloc'ed fields to point into
+ * the new $0 buffer, although that's how previous versions did
+ * it. It seems faster to leave the malloc'ed fields in place.
+ */
+ if ((r->flags & MALLOC) == 0) {
NODE *n;
getnode(n);
- if ((r->flags & FIELD) == 0) {
- *n = *Null_field;
- n->stlen = r->stlen;
- if ((r->flags & (NUMCUR|NUMBER)) != 0) {
- n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
-#ifdef HAVE_MPFR
- if (is_mpg_float(r)) {
- mpfr_init(n->mpg_numbr);
- mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
- } else if (is_mpg_integer(r)) {
- mpz_init(n->mpg_i);
- mpz_set(n->mpg_i, r->mpg_i);
- } else
-#endif
- n->numbr = r->numbr;
- }
- } else {
- *n = *r;
- n->flags &= ~MALLOC;
+ *n = *r;
+ if (r->valref > 1) {
+ /*
+ * This probably never happens, since it
+ * was not considered by previous versions of
+ * this function. But it seems clear that
+ * we can't leave r's stptr pointing into the
+ * old $0 buffer that we are about to unref.
+ * It's not a priori obvious that valref must be
+ * 1 in all cases, so it seems wise to support
+ * this corner case. The only question is
+ * whether to add a warning message.
+ */
+ emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
+ memcpy(r->stptr, cops, r->stlen);
+ r->stptr[r->stlen] = '\0';
+ r->flags |= MALLOC;
}
- n->stptr = cops;
+ if (cops[n->stlen] == '\0')
+ /* should be the case for $NF */
+ n->stptr = cops;
+ else {
+ /* make a NUL-terminated copy */
+ memcpy(databuf.space, cops, n->stlen);
+ databuf.space[n->stlen] = '\0';
+ n->stptr = databuf.space;
+ databuf.space += n->stlen+1;
+ }
unref(r);
fields_arr[i] = n;
assert((n->flags & WSTRCUR) == 0);
@@ -227,12 +263,67 @@ rebuild_record()
cops += fields_arr[i]->stlen + OFSlen;
}
+#ifndef NDEBUG
+ if ((fields_arr[0]->flags & MALLOC) == 0)
+ assert(fields_arr[0]->valref == 1);
+#endif
unref(fields_arr[0]);
fields_arr[0] = tmp;
field0_valid = true;
}
+static void
+allocate_databuf(size_t reclen, bool need_zero)
+{
+ size_t needed;
+#define INITIAL_SIZE 512
+#define MAX_SIZE ((size_t) ~0) /* maximally portable ... */
+
+ /* buffer management: */
+ if (databuf.size == 0) { /* first time */
+ emalloc(databuf.p, char *, INITIAL_SIZE, "set_record");
+ databuf.size = INITIAL_SIZE;
+
+ }
+ /*
+ * Make sure there's enough room. We need space for $0 plus a NUL
+ * terminator plus room for NUL-terminated copies of $1 through $NF.
+ * We use reclen as an upper bound for NF, assuming at least 1 byte
+ * for a field and its field separator (or fixed-width column). So our
+ * total requirement is reclen + 1 + 2*reclen -> 3*reclen + 1.
+ * It is tempting to skip the copy if the field value is already
+ * terminated with a NUL; this should normally be the case for $NF.
+ * Unfortunately, the parse methods often alter the string while
+ * parsing, typically changing the final NUL to a sentinel. So when
+ * set_field is called, the value of the character after the string
+ * in question may not be the actual value once parsing is complete.
+ * To be safe, it is prudent to copy all of the fields.
+ */
+ needed = 2*reclen; /* for copying $1..$NF */
+ if (need_zero)
+ needed += reclen + 1; /* for $0 plus '\0' */
+#ifdef GAWKDEBUG
+ /* malloc precise size so we can check for overruns with valgrind */
+ if (needed == 0)
+ needed = 1; /* erealloc requires non-zero bytes */
+ databuf.size = needed;
+ erealloc(databuf.p, char *, databuf.size, "set_record");
+#else
+ if (needed > databuf.size) {
+ do {
+ if (databuf.size > MAX_SIZE/2)
+ fatal(_("input record too large"));
+ databuf.size *= 2;
+ } while (needed > databuf.size);
+ erealloc(databuf.p, char *, databuf.size, "set_record");
+ }
+#endif
+
+#undef INITIAL_SIZE
+#undef MAX_SIZE
+}
+
/*
* set_record:
* setup $0, but defer parsing rest of line until reference is made to $(>0)
@@ -247,53 +338,34 @@ void
set_record(const char *buf, int cnt)
{
NODE *n;
- static char *databuf;
- static unsigned long databuf_size;
-#define INITIAL_SIZE 512
-#define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
-
- reset_record();
- /* buffer management: */
- if (databuf_size == 0) { /* first time */
- emalloc(databuf, char *, INITIAL_SIZE, "set_record");
- databuf_size = INITIAL_SIZE;
- memset(databuf, '\0', INITIAL_SIZE);
+ purge_record();
+ allocate_databuf(cnt, true);
- }
- /*
- * Make sure there's enough room. Since we sometimes need
- * to place a sentinel at the end, we make sure
- * databuf_size is > cnt after allocation.
- */
- if (cnt >= databuf_size) {
- while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
- databuf_size *= 2;
- erealloc(databuf, char *, databuf_size, "set_record");
- memset(databuf, '\0', databuf_size);
- }
/* copy the data */
- memcpy(databuf, buf, cnt);
+ memcpy(databuf.p, buf, cnt);
/*
* Add terminating '\0' so that C library routines
* will know when to stop.
*/
- databuf[cnt] = '\0';
+ databuf.p[cnt] = '\0';
+ databuf.space = databuf.p + cnt + 1;
/* manage field 0: */
+#ifndef NDEBUG
+ if ((fields_arr[0]->flags & MALLOC) == 0)
+ assert(fields_arr[0]->valref == 1);
+#endif
unref(fields_arr[0]);
getnode(n);
- n->stptr = databuf;
+ n->stptr = databuf.p;
n->stlen = cnt;
n->valref = 1;
n->type = Node_val;
n->stfmt = STFMT_UNUSED;
- n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
+ n->flags = (STRING|STRCUR|MAYBE_NUM); /* do not set MALLOC */
fields_arr[0] = n;
-
-#undef INITIAL_SIZE
-#undef MAX_SIZE
}
/* reset_record --- start over again with current $0 */
@@ -301,13 +373,32 @@ set_record(const char *buf, int cnt)
void
reset_record()
{
+ fields_arr[0] = force_string(fields_arr[0]);
+ purge_record();
+ if ((fields_arr[0]->flags & MALLOC) != 0) {
+ allocate_databuf(fields_arr[0]->stlen, false);
+ databuf.space = databuf.p;
+ }
+ else {
+ allocate_databuf(fields_arr[0]->stlen, true);
+ /* may have been realloced, so set stptr */
+ fields_arr[0]->stptr = databuf.p;
+ databuf.space = databuf.p + fields_arr[0]->stlen + 1;
+ }
+}
+
+static void
+purge_record()
+{
int i;
NODE *n;
- fields_arr[0] = force_string(fields_arr[0]);
-
NF = -1;
for (i = 1; i <= parse_high_water; i++) {
+#ifndef NDEBUG
+ if ((fields_arr[i]->flags & MALLOC) == 0)
+ assert(fields_arr[i]->valref == 1);
+#endif
unref(fields_arr[i]);
getnode(n);
*n = *Null_field;
diff --git a/gawkapi.c b/gawkapi.c
index df69012b..afefa4f6 100644
--- a/gawkapi.c
+++ b/gawkapi.c
@@ -440,6 +440,7 @@ node_to_awk_value(NODE *node, awk_value_t *val, awk_valtype_t wanted)
(void) force_string(node);
val->str_value.str = node->stptr;
val->str_value.len = node->stlen;
+ assert(val->str_value.str[val->str_value.len] == '\0');
ret = awk_true;
break;
@@ -468,6 +469,7 @@ node_to_awk_value(NODE *node, awk_value_t *val, awk_valtype_t wanted)
val->val_type = AWK_STRING;
val->str_value.str = node->stptr;
val->str_value.len = node->stlen;
+ assert(val->str_value.str[val->str_value.len] == '\0');
ret = awk_true;
} else
val->val_type = AWK_UNDEFINED;
diff --git a/gawkapi.h b/gawkapi.h
index 975f82df..10eab1cf 100644
--- a/gawkapi.h
+++ b/gawkapi.h
@@ -279,11 +279,7 @@ enum {
* be multibyte encoded in the current locale's encoding and character
* set. Gawk will convert internally to wide characters if necessary.
*
- * Note that the string may not be terminated with a '\0' character.
- * In particular, this happens for field values $n where n > 0 and n < NF,
- * since the string points directly into the $0 buffer. All other strings,
- * including those created by extensions, should be NUL-terminated. In general
- * though, extension code should not assume that the string is NUL-terminated!
+ * Note that the string will always be terminated with a '\0' character.
*/
typedef struct awk_string {
char *str; /* data */
diff --git a/int_array.c b/int_array.c
index e7913dea..937a91cf 100644
--- a/int_array.c
+++ b/int_array.c
@@ -128,7 +128,6 @@ is_integer(NODE *symbol, NODE *subs)
/* must be a STRING */
char *cp = subs->stptr, *cpend, *ptr;
- char save;
size_t len = subs->stlen;
if (len == 0 || (! isdigit((unsigned char) *cp) && *cp != '-'))
@@ -143,7 +142,8 @@ is_integer(NODE *symbol, NODE *subs)
if (len == 1 && *cp != '-') { /* single digit */
subs->numbr = (long) (*cp - '0');
if ((subs->flags & MAYBE_NUM) != 0) {
- subs->flags &= ~(MAYBE_NUM|STRING);
+ /* leave MAYBE_NUM set */
+ subs->flags &= ~STRING;
subs->flags |= NUMBER;
}
subs->flags |= (NUMCUR|NUMINT);
@@ -151,18 +151,16 @@ is_integer(NODE *symbol, NODE *subs)
}
cpend = cp + len;
- save = *cpend;
- *cpend = '\0';
errno = 0;
l = strtol(cp, & ptr, 10);
- *cpend = save;
if (errno != 0 || ptr != cpend)
return NULL;
subs->numbr = l;
if ((subs->flags & MAYBE_NUM) != 0) {
- subs->flags &= ~(MAYBE_NUM|STRING);
+ /* leave MAYBE_NUM set */
+ subs->flags &= ~STRING;
subs->flags |= NUMBER;
}
subs->flags |= NUMCUR;
diff --git a/interpret.h b/interpret.h
index 3bb4532e..9d7b423e 100644
--- a/interpret.h
+++ b/interpret.h
@@ -23,13 +23,19 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
+/*
+ * If "r" is a field, valref should normally be > 1, because the field is
+ * created initially with valref 1, and valref should be bumped when it is
+ * pushed onto the stack by Op_field_spec. On the other hand, if we are
+ * assigning to $n, then Op_store_field calls unref(*lhs) before assigning
+ * the new value, so that decrements valref. So if the RHS is a field with
+ * valref 1, that effectively means that this is an assignment like "$n = $n",
+ * so a no-op, other than triggering $0 reconstitution.
+ */
#define UNFIELD(l, r) \
{ \
/* if was a field, turn it into a var */ \
- if ((r->flags & FIELD) == 0) { \
- l = r; \
- } else if (r->valref == 1) { \
- r->flags &= ~FIELD; \
+ if ((r->flags & MALLOC) != 0 || r->valref == 1) { \
l = r; \
} else { \
l = dupnode(r); \
@@ -129,13 +135,10 @@ top:
case Op_push_i:
m = pc->memory;
if (! do_traditional && (m->flags & INTLSTR) != 0) {
- char *orig, *trans, save;
+ char *orig, *trans;
- save = m->stptr[m->stlen];
- m->stptr[m->stlen] = '\0';
orig = m->stptr;
trans = dgettext(TEXTDOMAIN, orig);
- m->stptr[m->stlen] = save;
m = make_string(trans, strlen(trans));
} else
UPREF(m);
@@ -357,12 +360,8 @@ uninitialized_scalar:
lhs = r_get_field(t1, (Func_ptr *) 0, true);
decr_sp();
DEREF(t1);
- /* only for $0, up ref count */
- if (*lhs == fields_arr[0]) {
- r = *lhs;
- UPREF(r);
- } else
- r = dupnode(*lhs);
+ r = *lhs;
+ UPREF(r);
PUSH(r);
break;
diff --git a/io.c b/io.c
index b9bce694..5ad7913a 100644
--- a/io.c
+++ b/io.c
@@ -480,7 +480,6 @@ nextfile(IOBUF **curfile, bool skipping)
if (arg == NULL || arg->stlen == 0)
continue;
arg = force_string(arg);
- arg->stptr[arg->stlen] = '\0';
if (! do_traditional) {
unref(ARGIND_node->var_value);
ARGIND_node->var_value = make_number((AWKNUM) i);
diff --git a/mpfr.c b/mpfr.c
index 0bb5b435..b239e4b6 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -275,7 +275,6 @@ static int
force_mpnum(NODE *n, int do_nondec, int use_locale)
{
char *cp, *cpend, *ptr, *cp1;
- char save;
int tval, base = 10;
if (n->stlen == 0) {
@@ -292,9 +291,6 @@ force_mpnum(NODE *n, int do_nondec, int use_locale)
return false;
}
- save = *cpend;
- *cpend = '\0';
-
if (*cp == '+' || *cp == '-')
cp1 = cp + 1;
else
@@ -329,7 +325,6 @@ done:
/* trailing space is OK for NUMBER */
while (ptr < cpend && isspace((unsigned char) *ptr))
ptr++;
- *cpend = save;
if (errno == 0 && ptr == cpend)
return true;
errno = 0;
@@ -347,7 +342,8 @@ mpg_force_number(NODE *n)
if (force_mpnum(n, (do_non_decimal_data && ! do_traditional), true)) {
if ((n->flags & MAYBE_NUM) != 0) {
- n->flags &= ~(MAYBE_NUM|STRING);
+ /* leave MAYBE_NUM set to indicate a strnum */
+ n->flags &= ~STRING;
n->flags |= NUMBER;
}
} else
diff --git a/node.c b/node.c
index bb2fe437..37aa9463 100644
--- a/node.c
+++ b/node.c
@@ -59,7 +59,6 @@ r_force_number(NODE *n)
{
char *cp;
char *cpend;
- char save;
char *ptr;
extern double strtod();
@@ -67,9 +66,9 @@ r_force_number(NODE *n)
return n;
/*
- * We should always set NUMCUR and clear MAYBE_NUM, and we may possibly
- * change STRING to NUMBER if MAYBE_NUM was set and it's a good numeric
- * string.
+ * We should always set NUMCUR. If MAYBE_NUM is set and it's a
+ * numeric string, we clear STRING and enable NUMBER, but if it's not
+ * numeric, we disable MAYBE_NUM.
*/
/* All the conditionals are an attempt to avoid the expensive strtod */
@@ -133,10 +132,13 @@ r_force_number(NODE *n)
/* nondec2awknum() saves and restores the byte after the string itself */
n->numbr = nondec2awknum(cp, cpend - cp, &ptr);
} else {
- save = *cpend;
- *cpend = '\0';
+ /*
+ * There is no need to set *cpend to '\0' because it is either
+ * pointing to white space or the '\0' at the end of the string.
+ * In either case, strtod should terminate on that character
+ * or earlier due to non-numeric characters.
+ */
n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
- *cpend = save;
}
if (errno == 0) {
@@ -164,7 +166,8 @@ badnum:
goodnum:
if ((n->flags & MAYBE_NUM) != 0) {
- n->flags &= ~(MAYBE_NUM|STRING);
+ /* leave MAYBE_NUM enabled to indicate that this is a strnum */
+ n->flags &= ~STRING;
n->flags |= NUMBER;
}
return n;
@@ -300,7 +303,6 @@ r_dupnode(NODE *n)
getnode(r);
*r = *n;
- r->flags &= ~FIELD;
r->flags |= MALLOC;
r->valref = 1;
/*
@@ -942,13 +944,14 @@ get_ieee_magic_val(char *val)
static bool first = true;
static AWKNUM inf;
static AWKNUM nan;
- char save;
char *ptr;
- save = val[4];
- val[4] = '\0';
+ /*
+ * There is no need to set val[4] to '\0' because it is either white
+ * space or the NUL character at the end of the string. Either way,
+ * strtod should terminate on that character.
+ */
AWKNUM v = strtod(val, &ptr);
- val[4] = save;
if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
if (first) {
diff --git a/profile.c b/profile.c
index be8977e8..aa8a152b 100644
--- a/profile.c
+++ b/profile.c
@@ -1541,7 +1541,6 @@ pp_number(NODE *n)
s = r_format_val("%.6g", 0, s);
- s->stptr[s->stlen] = '\0';
str = s->stptr;
freenode(s);
diff --git a/str_array.c b/str_array.c
index f66b22cc..e8ce973e 100644
--- a/str_array.c
+++ b/str_array.c
@@ -164,11 +164,9 @@ str_lookup(NODE *symbol, NODE *subs)
* "Array indices are always strings."
* "Array indices are always strings."
* ....
- * If subs is a STRNUM, copy it; don't clear the MAYBE_NUM
- * flag on it since other variables could be using the same
- * reference-counted value.
*/
- if (subs->stfmt != STFMT_UNUSED || (subs->flags & MAYBE_NUM) != 0) {
+ if (subs->stfmt != STFMT_UNUSED) {
+ /* The string was generated using CONVFMT. */
NODE *tmp;
/*
@@ -199,8 +197,6 @@ str_lookup(NODE *symbol, NODE *subs)
subs = dupnode(subs);
}
- assert((subs->flags & MAYBE_NUM) == 0);
-
getbucket(b);
b->ahnext = symbol->buckets[hash1];
symbol->buckets[hash1] = b;
diff --git a/test/ChangeLog b/test/ChangeLog
index 8b4d70e8..6821488c 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,16 @@
+2016-07-06 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * forcenum.awk: We no longer need to force the strnum conversion,
+ since typeof now does this automatically.
+ * forcenum.ok: Change "number" to "strnum" for the numeric strings.
+ * rebuild.in: Change input to include a strnum.
+ * rebuild.ok: Update results.
+
+2016-07-04 Andrew J. Schorr <aschorr@telemetry-investments.com>
+
+ * Makefile.am (arrayind3): New test.
+ * arrayind3.awk, arrayind3.ok: New files.
+
2016-07-03 Andrew J. Schorr <aschorr@telemetry-investments.com>
* Makefile.am (rebuild): New test.
diff --git a/test/Makefile.am b/test/Makefile.am
index 3a51b3ff..a13a01c9 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -57,6 +57,8 @@ EXTRA_DIST = \
arrayind1.ok \
arrayind2.awk \
arrayind2.ok \
+ arrayind3.awk \
+ arrayind3.ok \
arrayparm.awk \
arrayparm.ok \
arrayprm2.awk \
@@ -1137,7 +1139,7 @@ CLEANFILES = core core.* fmtspcl.ok
# try to keep these sorted. each letter starts a new line
BASIC_TESTS = \
- addcomma anchgsub argarray arrayind1 arrayind2 arrayparm arrayprm2 arrayprm3 \
+ addcomma anchgsub argarray arrayind1 arrayind2 arrayind3 arrayparm arrayprm2 arrayprm3 \
arrayref arrymem1 arryref2 arryref3 arryref4 arryref5 arynasty \
arynocls aryprm1 aryprm2 aryprm3 aryprm4 aryprm5 aryprm6 aryprm7 \
aryprm8 aryprm9 arysubnm asgext awkpath \
diff --git a/test/Makefile.in b/test/Makefile.in
index 29064326..6db4c34c 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -314,6 +314,8 @@ EXTRA_DIST = \
arrayind1.ok \
arrayind2.awk \
arrayind2.ok \
+ arrayind3.awk \
+ arrayind3.ok \
arrayparm.awk \
arrayparm.ok \
arrayprm2.awk \
@@ -1393,7 +1395,7 @@ CLEANFILES = core core.* fmtspcl.ok
# try to keep these sorted. each letter starts a new line
BASIC_TESTS = \
- addcomma anchgsub argarray arrayind1 arrayind2 arrayparm arrayprm2 arrayprm3 \
+ addcomma anchgsub argarray arrayind1 arrayind2 arrayind3 arrayparm arrayprm2 arrayprm3 \
arrayref arrymem1 arryref2 arryref3 arryref4 arryref5 arynasty \
arynocls aryprm1 aryprm2 aryprm3 aryprm4 aryprm5 aryprm6 aryprm7 \
aryprm8 aryprm9 arysubnm asgext awkpath \
@@ -2775,6 +2777,11 @@ arrayind2:
@AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+arrayind3:
+ @echo $@
+ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
arrayparm:
@echo $@
@AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index 525a44d1..2aafd2bb 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -20,6 +20,11 @@ arrayind2:
@AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+arrayind3:
+ @echo $@
+ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
arrayparm:
@echo $@
@AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/arrayind3.awk b/test/arrayind3.awk
new file mode 100644
index 00000000..ca4c58b5
--- /dev/null
+++ b/test/arrayind3.awk
@@ -0,0 +1,19 @@
+BEGIN {
+ # initialize cint arrays
+ pos[0] = 0
+ posout[0] = 0
+ split("00000779770060", f) # f[1] is a strnum
+ pos[f[1]] = 1 # subscripts must be strings!
+ for (x in pos) {
+ # if x is a strnum, then the
+ # x != 0 test may convert it to an integral NUMBER,
+ # and we might lose the unusual string representation
+ # if the cint code is not careful to recognize that this is
+ # actually a string
+ if (x != 0)
+ posout[x] = pos[x]
+ }
+ # which array element is populated?
+ print posout[779770060]
+ print posout["00000779770060"]
+}
diff --git a/test/arrayind3.ok b/test/arrayind3.ok
new file mode 100644
index 00000000..a464d9da
--- /dev/null
+++ b/test/arrayind3.ok
@@ -0,0 +1,2 @@
+
+1
diff --git a/test/forcenum.awk b/test/forcenum.awk
index 54c536c9..1a7ddce7 100644
--- a/test/forcenum.awk
+++ b/test/forcenum.awk
@@ -1,8 +1,6 @@
BEGIN {
- # first, make some strnums
+ # make some strnums
nf = split("|5apple|+NaN| 6|0x1az|011Q|027", f, "|")
- for (i = 1; i <= nf; i++) {
- x = f[i]+0 # trigger strnum conversion to number or string
+ for (i = 1; i <= nf; i++)
printf "[%s] -> %g (type %s)\n", f[i], f[i], typeof(f[i])
- }
}
diff --git a/test/forcenum.ok b/test/forcenum.ok
index c74eefc7..a379db62 100644
--- a/test/forcenum.ok
+++ b/test/forcenum.ok
@@ -1,7 +1,7 @@
[] -> 0 (type string)
[5apple] -> 5 (type string)
-[+NaN] -> nan (type number)
-[ 6] -> 6 (type number)
+[+NaN] -> nan (type strnum)
+[ 6] -> 6 (type strnum)
[0x1az] -> 26 (type string)
[011Q] -> 9 (type string)
-[027] -> 23 (type number)
+[027] -> 23 (type strnum)
diff --git a/test/rebuild.in b/test/rebuild.in
index b2901ea9..2f16a825 100644
--- a/test/rebuild.in
+++ b/test/rebuild.in
@@ -1 +1 @@
-a b
+a 6.3
diff --git a/test/rebuild.ok b/test/rebuild.ok
index 29635279..0fe72e23 100644
--- a/test/rebuild.ok
+++ b/test/rebuild.ok
@@ -1,2 +1,2 @@
-test b
+test 6.3
strnum