1 files changed, 87 insertions, 135 deletions
diff --git a/field.c b/field.c
index 5f5b2b65..0799fb1b 100644
--- a/field.c
+++ b/field.c
@@ -2,22 +2,22 @@
  * field.c - routines for dealing with fields and record parsing
  */
 
-/* 
+/*
  * Copyright (C) 1986, 1988, 1989, 1991-2016 the Free Software Foundation, Inc.
- * 
+ *
  * This file is part of GAWK, the GNU implementation of the
  * AWK Programming Language.
- * 
+ *
  * GAWK is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * GAWK is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
@@ -44,8 +44,6 @@ static long re_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static long def_parse_field(long, char **, int, NODE *,
 			      Regexp *, Setfunc, NODE *, NODE *, bool);
-static long posix_def_parse_field(long, char **, int, NODE *,
-			      Regexp *, Setfunc, NODE *, NODE *, bool);
 static long null_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static long sc_parse_field(long, char **, int, NODE *,
@@ -57,6 +55,7 @@ static long fpat_parse_field(long, char **, int, NODE *,
 static void set_element(long num, char * str, long len, NODE *arr);
 static void grow_fields_arr(long num);
 static void set_field(long num, char *str, long len, NODE *dummy);
+static void purge_record(void);
 
 static char *parse_extent;	/* marks where to restart parse of record */
 static long parse_high_water = 0; /* field number that we have parsed so far */
@@ -95,7 +94,7 @@ init_fields()
 	getnode(Null_field);
 	*Null_field = *Nnull_string;
 	Null_field->valref = 1;
-	Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
+	Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
 
 	field0_valid = true;
 }
@@ -133,7 +132,7 @@ set_field(long num,
 	n = fields_arr[num];
 	n->stptr = str;
 	n->stlen = len;
-	n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
+	n->flags = (STRCUR|STRING|USER_INPUT);	/* do not set MALLOC */
 }
 
 /* rebuild_record --- Someone assigned a value to $(something).
@@ -163,7 +162,7 @@ rebuild_record()
 	tlen += (NF - 1) * OFSlen;
 	if ((long) tlen < 0)
 		tlen = 0;
-	emalloc(ops, char *, tlen + 2, "rebuild_record");
+	emalloc(ops, char *, tlen + 1, "rebuild_record");
 	cops = ops;
 	ops[0] = '\0';
 	for (i = 1;  i <= NF; i++) {
@@ -196,29 +195,32 @@ rebuild_record()
 	 */
 	for (cops = ops, i = 1; i <= NF; i++) {
 		NODE *r = fields_arr[i];
-		if (r->stlen > 0) {
+		/*
+		 * There is no reason to copy malloc'ed fields to point into
+		 * the new $0 buffer, although that's how previous versions did
+		 * it. It seems faster to leave the malloc'ed fields in place.
+		 */
+		if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
 			NODE *n;
 			getnode(n);
 
-			if ((r->flags & FIELD) == 0) {
-				*n = *Null_field;
-				n->stlen = r->stlen;
-				if ((r->flags & (NUMCUR|NUMBER)) != 0) {
-					n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
-#ifdef HAVE_MPFR
-					if (is_mpg_float(r)) {
-					        mpfr_init(n->mpg_numbr);
-						mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
-					} else if (is_mpg_integer(r)) {
-					        mpz_init(n->mpg_i);
-						mpz_set(n->mpg_i, r->mpg_i);
-					} else
-#endif
-					n->numbr = r->numbr;
-				}
-			} else {
-				*n = *r;
-				n->flags &= ~(MALLOC|STRING);
+			*n = *r;
+			if (r->valref > 1) {
+				/*
+				 * This probably never happens, since it
+				 * was not considered by previous versions of
+				 * this function. But it seems clear that
+				 * we can't leave r's stptr pointing into the
+				 * old $0 buffer that we are about to unref.
+				 * It's not a priori obvious that valref must be
+				 * 1 in all cases, so it seems wise to support
+				 * this corner case. The only question is
+				 * whether to add a warning message.
+				 */
+				emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
+				memcpy(r->stptr, cops, r->stlen);
+				r->stptr[r->stlen] = '\0';
+				r->flags |= MALLOC;
 			}
 
 			n->stptr = cops;
@@ -229,6 +231,10 @@ rebuild_record()
 		cops += fields_arr[i]->stlen + OFSlen;
 	}
 
+	assert((fields_arr[0]->flags & MALLOC) == 0
+		? fields_arr[0]->valref == 1
+		: true);
+
 	unref(fields_arr[0]);
 
 	fields_arr[0] = tmp;
@@ -254,7 +260,7 @@ set_record(const char *buf, int cnt)
 #define INITIAL_SIZE	512
 #define MAX_SIZE	((unsigned long) ~0)	/* maximally portable ... */
 
-	reset_record();
+	purge_record();
 
 	/* buffer management: */
 	if (databuf_size == 0) {	/* first time */
@@ -269,8 +275,11 @@ set_record(const char *buf, int cnt)
 	 * databuf_size is > cnt after allocation.
 	 */
 	if (cnt >= databuf_size) {
-		while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
+		do {
+			if (databuf_size > MAX_SIZE/2)
+				fatal(_("input record too large"));
 			databuf_size *= 2;
+		} while (cnt >= databuf_size);
 		erealloc(databuf, char *, databuf_size, "set_record");
 		memset(databuf, '\0', databuf_size);
 	}
@@ -278,20 +287,24 @@ set_record(const char *buf, int cnt)
 	memcpy(databuf, buf, cnt);
 
 	/*
-	 * Add terminating '\0' so that C library routines 
+	 * Add terminating '\0' so that C library routines
 	 * will know when to stop.
 	 */
 	databuf[cnt] = '\0';
 
 	/* manage field 0: */
+	assert((fields_arr[0]->flags & MALLOC) == 0
+		? fields_arr[0]->valref == 1
+		: true);
+
 	unref(fields_arr[0]);
 	getnode(n);
 	n->stptr = databuf;
 	n->stlen = cnt;
 	n->valref = 1;
 	n->type = Node_val;
-	n->stfmt = -1;
-	n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
+	n->stfmt = STFMT_UNUSED;
+	n->flags = (STRING|STRCUR|USER_INPUT);	/* do not set MALLOC */
 	fields_arr[0] = n;
 
 #undef INITIAL_SIZE
@@ -303,13 +316,21 @@ set_record(const char *buf, int cnt)
 void
 reset_record()
 {
+	fields_arr[0] = force_string(fields_arr[0]);
+	purge_record();
+}
+
+static void
+purge_record()
+{
 	int i;
 	NODE *n;
 
-	fields_arr[0] = force_string(fields_arr[0]);
-
 	NF = -1;
 	for (i = 1; i <= parse_high_water; i++) {
+		assert((fields_arr[i]->flags & MALLOC) == 0
+			? fields_arr[i]->valref == 1
+			: true);
 		unref(fields_arr[i]);
 		getnode(n);
 		*n = *Null_field;
@@ -341,7 +362,7 @@ set_NF()
 	assert(NF != -1);
 
 	(void) force_number(NF_node->var_value);
-	nf = get_number_si(NF_node->var_value); 
+	nf = get_number_si(NF_node->var_value);
 	if (nf < 0)
 		fatal(_("NF set to negative value"));
 	NF = nf;
@@ -409,7 +430,7 @@ re_parse_field(long up_to,	/* parse only up to this field number */
 		sep = scan;
 		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 			scan++;
-		if (sep_arr != NULL && sep < scan) 
+		if (sep_arr != NULL && sep < scan)
 			set_element(nf, sep, (long)(scan - sep), sep_arr);
 	}
 
@@ -441,8 +462,8 @@ re_parse_field(long up_to,	/* parse only up to this field number */
 		}
 		(*set)(++nf, field,
 		       (long)(scan + RESTART(rp, scan) - field), n);
-		if (sep_arr != NULL) 
-	    		set_element(nf, scan + RESTART(rp, scan), 
+		if (sep_arr != NULL)
+	    		set_element(nf, scan + RESTART(rp, scan),
            			(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
 		scan += REEND(rp, scan);
 		field = scan;
@@ -506,7 +527,7 @@ def_parse_field(long up_to,	/* parse only up to this field number */
 	sep = scan;
 	for (; nf < up_to; scan++) {
 		/*
-		 * special case:  fs is single space, strip leading whitespace 
+		 * special case:  fs is single space, strip leading whitespace
 		 */
 		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 			scan++;
@@ -538,75 +559,6 @@ def_parse_field(long up_to,	/* parse only up to this field number */
 }
 
 /*
- * posix_def_parse_field --- default field parsing.
- *
- * This is called both from get_field() and from do_split()
- * via (*parse_field)().  This variation is for when FS is a single space
- * character.  The only difference between this and def_parse_field()
- * is that this one does not allow newlines to separate fields.
- */
-
-static long
-posix_def_parse_field(long up_to,	/* parse only up to this field number */
-	char **buf,	/* on input: string to parse; on output: point to start next */
-	int len,
-	NODE *fs,
-	Regexp *rp ATTRIBUTE_UNUSED,
-	Setfunc set,	/* routine to set the value of the parsed field */
-	NODE *n,
-	NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
-	bool in_middle ATTRIBUTE_UNUSED)
-{
-	char *scan = *buf;
-	long nf = parse_high_water;
-	char *field;
-	char *end = scan + len;
-	char sav;
-
-	if (up_to == UNLIMITED)
-		nf = 0;
-	if (len == 0)
-		return nf;
-
-	/*
-	 * Nasty special case. If FS set to "", return whole record
-	 * as first field. This is not worth a separate function.
-	 */
-	if (fs->stlen == 0) {
-		(*set)(++nf, *buf, len, n);
-		*buf += len;
-		return nf;
-	}
-
-	/* before doing anything save the char at *end */
-	sav = *end;
-	/* because it will be destroyed now: */
-
-	*end = ' ';	/* sentinel character */
-	for (; nf < up_to; scan++) {
-		/*
-		 * special case:  fs is single space, strip leading whitespace 
-		 */
-		while (scan < end && (*scan == ' ' || *scan == '\t'))
-			scan++;
-		if (scan >= end)
-			break;
-		field = scan;
-		while (*scan != ' ' && *scan != '\t')
-			scan++;
-		(*set)(++nf, field, (long)(scan - field), n);
-		if (scan == end)
-			break;
-	}
-
-	/* everything done, restore original char at *end */
-	*end = sav;
-
-	*buf = scan;
-	return nf;
-}
-
-/*
  * null_parse_field --- each character is a separate field
  *
  * This is called both from get_field() and from do_split()
@@ -857,11 +809,11 @@ get_field(long requested, Func_ptr *assign)
 	/*
 	 * Keep things uniform. Also, mere intention of assigning something
 	 * to $n should not make $0 invalid. Makes sense to invalidate $0
-	 * after the actual assignment is performed. Not a real issue in 
+	 * after the actual assignment is performed. Not a real issue in
 	 * the interpreter otherwise, but causes problem in the
 	 * debugger when watching or printing fields.
 	 */
-  
+
 	if (assign != NULL)
 		*assign = invalidate_field0;	/* $0 needs reconstruction */
 #endif
@@ -930,7 +882,7 @@ set_element(long num, char *s, long len, NODE *n)
 	NODE *sub;
 
 	it = make_string(s, len);
-	it->flags |= MAYBE_NUM;
+	it->flags |= USER_INPUT;
 	sub = make_number((AWKNUM) (num));
 	lhs = assoc_lookup(n, sub);
 	unref(*lhs);
@@ -977,12 +929,12 @@ do_split(int nargs)
 
 	if (sep_arr != NULL) {
 		if (sep_arr == arr)
-			fatal(_("split: cannot use the same array for second and fourth args")); 
+			fatal(_("split: cannot use the same array for second and fourth args"));
 
 		/* This checks need to be done before clearing any of the arrays */
 		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 			if (tmp == arr)
-				fatal(_("split: cannot use a subarray of second arg for fourth arg"));	
+				fatal(_("split: cannot use a subarray of second arg for fourth arg"));
 		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 			if (tmp == sep_arr)
 				fatal(_("split: cannot use a subarray of fourth arg for second arg"));
@@ -1000,6 +952,9 @@ do_split(int nargs)
 		return make_number((AWKNUM) 0);
 	}
 
+	if ((sep->flags & REGEX) != 0)
+		sep = sep->typed_re;
+
 	if (   (sep->re_flags & FS_DFLT) != 0
 	    && current_field_sep() == Using_FS
 	    && ! RS_is_null) {
@@ -1020,10 +975,7 @@ do_split(int nargs)
 			}
 		} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
 			if (fs->stptr[0] == ' ') {
-				if (do_posix)
-					parseit = posix_def_parse_field;
-				else
-					parseit = def_parse_field;
+				parseit = def_parse_field;
 			} else
 				parseit = sc_parse_field;
 		} else {
@@ -1065,13 +1017,16 @@ do_patsplit(int nargs)
 
 	src = TOP_STRING();
 
+	if ((sep->flags & REGEX) != 0)
+		sep = sep->typed_re;
+
 	fpat = sep->re_exp;
 	if (fpat->stlen == 0)
 		fatal(_("patsplit: third argument must be non-null"));
 
 	if (sep_arr != NULL) {
 		if (sep_arr == arr)
-			fatal(_("patsplit: cannot use the same array for second and fourth args")); 
+			fatal(_("patsplit: cannot use the same array for second and fourth args"));
 
 		/* These checks need to be done before clearing any of the arrays */
 		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
@@ -1138,7 +1093,7 @@ set_FIELDWIDTHS()
 	FIELDWIDTHS[0] = 0;
 	for (i = 1; ; i++) {
 		unsigned long int tmp;
-		if (i + 2 >= fw_alloc) {
+		if (i + 1 >= fw_alloc) {
 			fw_alloc *= 2;
 			erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
 		}
@@ -1165,7 +1120,7 @@ set_FIELDWIDTHS()
 		    	|| (*end != '\0' && ! is_blank(*end))
 				|| !(0 < tmp && tmp <= INT_MAX)
 		) {
-			fatal_error = true;	
+			fatal_error = true;
 			break;
 		}
 		FIELDWIDTHS[i] = tmp;
@@ -1236,7 +1191,7 @@ set_FS()
 	 * FS_regexp will be NULL with a non-null FS_re_yes_case.
 	 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
 	 * Please do not remerge.
-	 */ 
+	 */
 	refree(FS_re_yes_case);
 	refree(FS_re_no_case);
 	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
@@ -1278,10 +1233,7 @@ choose_fs_function:
 			}
 		}
 	} else {
-		if (do_posix)
-			parse_field = posix_def_parse_field;
-		else
-			parse_field = def_parse_field;
+		parse_field = def_parse_field;
 
 		if (fs->stlen == 1) {
 			if (fs->stptr[0] == ' ')
@@ -1482,19 +1434,19 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
  * BEGIN {
  * 	false = 0
  * 	true = 1
- * 
+ *
  * 	fpat[1] = "([^,]*)|(\"[^\"]+\")"
  * 	fpat[2] = fpat[1]
  * 	fpat[3] = fpat[1]
  * 	fpat[4] = "aa+"
  * 	fpat[5] = fpat[4]
- * 
+ *
  * 	data[1] = "Robbins,,Arnold,"
  * 	data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
  * 	data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
  * 	data[4] = "bbbaaacccdddaaaaaqqqq"
  * 	data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
- * 
+ *
  * 	for (i = 1; i in data; i++) {
  * 		printf("Splitting: <%s>\n", data[i])
  * 		n = mypatsplit(data[i], fields, fpat[i], seps)
@@ -1505,7 +1457,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
  * 			printf("seps[%s] = <%s>\n", j, seps[j])
  * 	}
  * }
- * 
+ *
  * function mypatsplit(string, array, pattern, seps,
  * 			eosflag, non_empty, nf) # locals
  * {
@@ -1513,7 +1465,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
  * 	delete seps
  * 	if (length(string) == 0)
  * 		return 0
- * 
+ *
  * 	eosflag = non_empty = false
  * 	nf = 0
  * 	while (match(string, pattern)) {
@@ -1564,7 +1516,7 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
  * 	}
  * 	if (length(string) > 0)
  * 		seps[nf] = string
- * 
+ *
  * 	return length(array)
  * }
  */
@@ -1637,7 +1589,7 @@ fpat_parse_field(long up_to,	/* parse only up to this field number */
 			 * last match was non-empty, and at the
 			 * current character we get a zero length match,
 			 * which we don't want, so skip over it
-			 */ 
+			 */
 			non_empty = false;
 			if (sep_arr != NULL) {
 				need_to_set_sep = false;