1 files changed, 343 insertions, 366 deletions
diff --git a/field.c b/field.c
index 5f5b2b65..d8c97413 100644
--- a/field.c
+++ b/field.c
@@ -2,22 +2,22 @@
  * field.c - routines for dealing with fields and record parsing
  */
 
-/* 
+/*
  * Copyright (C) 1986, 1988, 1989, 1991-2016 the Free Software Foundation, Inc.
- * 
+ *
  * This file is part of GAWK, the GNU implementation of the
  * AWK Programming Language.
- * 
+ *
  * GAWK is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * GAWK is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
@@ -38,25 +38,34 @@ is_blank(int c)
 
 typedef void (* Setfunc)(long, char *, long, NODE *);
 
-static long (*parse_field)(long, char **, int, NODE *,
+/* is the API currently overriding the default parsing mechanism? */
+static bool api_parser_override = false;
+typedef long (*parse_field_func_t)(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
+static parse_field_func_t parse_field;
+/*
+ * N.B. The normal_parse_field function pointer contains the parse_field value
+ * that should be used except when API field parsing is overriding the default
+ * field parsing mechanism.
+ */
+static parse_field_func_t normal_parse_field;
 static long re_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static long def_parse_field(long, char **, int, NODE *,
 			      Regexp *, Setfunc, NODE *, NODE *, bool);
-static long posix_def_parse_field(long, char **, int, NODE *,
-			      Regexp *, Setfunc, NODE *, NODE *, bool);
 static long null_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static long sc_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static long fw_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
+static const awk_fieldwidth_info_t *api_fw = NULL;
 static long fpat_parse_field(long, char **, int, NODE *,
 			     Regexp *, Setfunc, NODE *, NODE *, bool);
 static void set_element(long num, char * str, long len, NODE *arr);
 static void grow_fields_arr(long num);
 static void set_field(long num, char *str, long len, NODE *dummy);
+static void purge_record(void);
 
 static char *parse_extent;	/* marks where to restart parse of record */
 static long parse_high_water = 0; /* field number that we have parsed so far */
@@ -65,7 +74,7 @@ static bool resave_fs;
 static NODE *save_FS;		/* save current value of FS when line is read,
 				 * to be used in deferred parsing
 				 */
-static int *FIELDWIDTHS = NULL;
+static awk_fieldwidth_info_t *FIELDWIDTHS = NULL;
 
 NODE **fields_arr;		/* array of pointers to the field nodes */
 bool field0_valid;		/* $(>0) has not been changed yet */
@@ -95,7 +104,7 @@ init_fields()
 	getnode(Null_field);
 	*Null_field = *Nnull_string;
 	Null_field->valref = 1;
-	Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
+	Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */
 
 	field0_valid = true;
 }
@@ -133,7 +142,7 @@ set_field(long num,
 	n = fields_arr[num];
 	n->stptr = str;
 	n->stlen = len;
-	n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
+	n->flags = (STRCUR|STRING|USER_INPUT);	/* do not set MALLOC */
 }
 
 /* rebuild_record --- Someone assigned a value to $(something).
@@ -163,7 +172,7 @@ rebuild_record()
 	tlen += (NF - 1) * OFSlen;
 	if ((long) tlen < 0)
 		tlen = 0;
-	emalloc(ops, char *, tlen + 2, "rebuild_record");
+	emalloc(ops, char *, tlen + 1, "rebuild_record");
 	cops = ops;
 	ops[0] = '\0';
 	for (i = 1;  i <= NF; i++) {
@@ -196,29 +205,32 @@ rebuild_record()
 	 */
 	for (cops = ops, i = 1; i <= NF; i++) {
 		NODE *r = fields_arr[i];
-		if (r->stlen > 0) {
+		/*
+		 * There is no reason to copy malloc'ed fields to point into
+		 * the new $0 buffer, although that's how previous versions did
+		 * it. It seems faster to leave the malloc'ed fields in place.
+		 */
+		if (r->stlen > 0 && (r->flags & MALLOC) == 0) {
 			NODE *n;
 			getnode(n);
 
-			if ((r->flags & FIELD) == 0) {
-				*n = *Null_field;
-				n->stlen = r->stlen;
-				if ((r->flags & (NUMCUR|NUMBER)) != 0) {
-					n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
-#ifdef HAVE_MPFR
-					if (is_mpg_float(r)) {
-					        mpfr_init(n->mpg_numbr);
-						mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
-					} else if (is_mpg_integer(r)) {
-					        mpz_init(n->mpg_i);
-						mpz_set(n->mpg_i, r->mpg_i);
-					} else
-#endif
-					n->numbr = r->numbr;
-				}
-			} else {
-				*n = *r;
-				n->flags &= ~(MALLOC|STRING);
+			*n = *r;
+			if (r->valref > 1) {
+				/*
+				 * This probably never happens, since it
+				 * was not considered by previous versions of
+				 * this function. But it seems clear that
+				 * we can't leave r's stptr pointing into the
+				 * old $0 buffer that we are about to unref.
+				 * It's not a priori obvious that valref must be
+				 * 1 in all cases, so it seems wise to suppport
+				 * this corner case. The only question is
+				 * whether to add a warning message.
+				 */
+				emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record");
+				memcpy(r->stptr, cops, r->stlen);
+				r->stptr[r->stlen] = '\0';
+				r->flags |= MALLOC;
 			}
 
 			n->stptr = cops;
@@ -229,6 +241,10 @@ rebuild_record()
 		cops += fields_arr[i]->stlen + OFSlen;
 	}
 
+	assert((fields_arr[0]->flags & MALLOC) == 0
+		? fields_arr[0]->valref == 1
+		: true);
+
 	unref(fields_arr[0]);
 
 	fields_arr[0] = tmp;
@@ -246,7 +262,7 @@ rebuild_record()
  * but better correct than fast.
  */
 void
-set_record(const char *buf, int cnt)
+set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
 {
 	NODE *n;
 	static char *databuf;
@@ -254,14 +270,12 @@ set_record(const char *buf, int cnt)
 #define INITIAL_SIZE	512
 #define MAX_SIZE	((unsigned long) ~0)	/* maximally portable ... */
 
-	reset_record();
+	purge_record();
 
 	/* buffer management: */
 	if (databuf_size == 0) {	/* first time */
-		emalloc(databuf, char *, INITIAL_SIZE, "set_record");
+		ezalloc(databuf, char *, INITIAL_SIZE, "set_record");
 		databuf_size = INITIAL_SIZE;
-		memset(databuf, '\0', INITIAL_SIZE);
-
 	}
 	/*
 	 * Make sure there's enough room. Since we sometimes need
@@ -269,8 +283,11 @@ set_record(const char *buf, int cnt)
 	 * databuf_size is > cnt after allocation.
 	 */
 	if (cnt >= databuf_size) {
-		while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
+		do {
+			if (databuf_size > MAX_SIZE/2)
+				fatal(_("input record too large"));
 			databuf_size *= 2;
+		} while (cnt >= databuf_size);
 		erealloc(databuf, char *, databuf_size, "set_record");
 		memset(databuf, '\0', databuf_size);
 	}
@@ -278,21 +295,38 @@ set_record(const char *buf, int cnt)
 	memcpy(databuf, buf, cnt);
 
 	/*
-	 * Add terminating '\0' so that C library routines 
+	 * Add terminating '\0' so that C library routines
 	 * will know when to stop.
 	 */
 	databuf[cnt] = '\0';
 
 	/* manage field 0: */
+	assert((fields_arr[0]->flags & MALLOC) == 0
+		? fields_arr[0]->valref == 1
+		: true);
+
 	unref(fields_arr[0]);
 	getnode(n);
 	n->stptr = databuf;
 	n->stlen = cnt;
 	n->valref = 1;
 	n->type = Node_val;
-	n->stfmt = -1;
-	n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
+	n->stfmt = STFMT_UNUSED;
+	n->flags = (STRING|STRCUR|USER_INPUT);	/* do not set MALLOC */
 	fields_arr[0] = n;
+	if (fw != api_fw) {
+		if ((api_fw = fw) != NULL) {
+			if (! api_parser_override) {
+				api_parser_override = true;
+				parse_field = fw_parse_field;
+				update_PROCINFO_str("FS", "API");
+			}
+		} else if (api_parser_override) {
+			api_parser_override = false;
+			parse_field = normal_parse_field;
+			update_PROCINFO_str("FS", current_field_sep_str());
+		}
+	}
 
 #undef INITIAL_SIZE
 #undef MAX_SIZE
@@ -303,13 +337,21 @@ set_record(const char *buf, int cnt)
 void
 reset_record()
 {
+	fields_arr[0] = force_string(fields_arr[0]);
+	purge_record();
+}
+
+static void
+purge_record()
+{
 	int i;
 	NODE *n;
 
-	fields_arr[0] = force_string(fields_arr[0]);
-
 	NF = -1;
 	for (i = 1; i <= parse_high_water; i++) {
+		assert((fields_arr[i]->flags & MALLOC) == 0
+			? fields_arr[i]->valref == 1
+			: true);
 		unref(fields_arr[i]);
 		getnode(n);
 		*n = *Null_field;
@@ -341,7 +383,7 @@ set_NF()
 	assert(NF != -1);
 
 	(void) force_number(NF_node->var_value);
-	nf = get_number_si(NF_node->var_value); 
+	nf = get_number_si(NF_node->var_value);
 	if (nf < 0)
 		fatal(_("NF set to negative value"));
 	NF = nf;
@@ -409,7 +451,7 @@ re_parse_field(long up_to,	/* parse only up to this field number */
 		sep = scan;
 		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 			scan++;
-		if (sep_arr != NULL && sep < scan) 
+		if (sep_arr != NULL && sep < scan)
 			set_element(nf, sep, (long)(scan - sep), sep_arr);
 	}
 
@@ -441,8 +483,8 @@ re_parse_field(long up_to,	/* parse only up to this field number */
 		}
 		(*set)(++nf, field,
 		       (long)(scan + RESTART(rp, scan) - field), n);
-		if (sep_arr != NULL) 
-	    		set_element(nf, scan + RESTART(rp, scan), 
+		if (sep_arr != NULL)
+	    		set_element(nf, scan + RESTART(rp, scan),
            			(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
 		scan += REEND(rp, scan);
 		field = scan;
@@ -506,7 +548,7 @@ def_parse_field(long up_to,	/* parse only up to this field number */
 	sep = scan;
 	for (; nf < up_to; scan++) {
 		/*
-		 * special case:  fs is single space, strip leading whitespace 
+		 * special case:  fs is single space, strip leading whitespace
 		 */
 		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 			scan++;
@@ -538,75 +580,6 @@ def_parse_field(long up_to,	/* parse only up to this field number */
 }
 
 /*
- * posix_def_parse_field --- default field parsing.
- *
- * This is called both from get_field() and from do_split()
- * via (*parse_field)().  This variation is for when FS is a single space
- * character.  The only difference between this and def_parse_field()
- * is that this one does not allow newlines to separate fields.
- */
-
-static long
-posix_def_parse_field(long up_to,	/* parse only up to this field number */
-	char **buf,	/* on input: string to parse; on output: point to start next */
-	int len,
-	NODE *fs,
-	Regexp *rp ATTRIBUTE_UNUSED,
-	Setfunc set,	/* routine to set the value of the parsed field */
-	NODE *n,
-	NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
-	bool in_middle ATTRIBUTE_UNUSED)
-{
-	char *scan = *buf;
-	long nf = parse_high_water;
-	char *field;
-	char *end = scan + len;
-	char sav;
-
-	if (up_to == UNLIMITED)
-		nf = 0;
-	if (len == 0)
-		return nf;
-
-	/*
-	 * Nasty special case. If FS set to "", return whole record
-	 * as first field. This is not worth a separate function.
-	 */
-	if (fs->stlen == 0) {
-		(*set)(++nf, *buf, len, n);
-		*buf += len;
-		return nf;
-	}
-
-	/* before doing anything save the char at *end */
-	sav = *end;
-	/* because it will be destroyed now: */
-
-	*end = ' ';	/* sentinel character */
-	for (; nf < up_to; scan++) {
-		/*
-		 * special case:  fs is single space, strip leading whitespace 
-		 */
-		while (scan < end && (*scan == ' ' || *scan == '\t'))
-			scan++;
-		if (scan >= end)
-			break;
-		field = scan;
-		while (*scan != ' ' && *scan != '\t')
-			scan++;
-		(*set)(++nf, field, (long)(scan - field), n);
-		if (scan == end)
-			break;
-	}
-
-	/* everything done, restore original char at *end */
-	*end = sav;
-
-	*buf = scan;
-	return nf;
-}
-
-/*
  * null_parse_field --- each character is a separate field
  *
  * This is called both from get_field() and from do_split()
@@ -739,6 +712,31 @@ sc_parse_field(long up_to,	/* parse only up to this field number */
 }
 
 /*
+ * calc_mbslen --- calculate the length in bytes of a multi-byte string
+ * containing len characters.
+ */
+
+static size_t
+calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs)
+{
+
+	size_t mbclen;
+	char *mbscan = scan;
+
+	while (len-- > 0 && mbscan < end) {
+		mbclen = mbrlen(mbscan, end - mbscan, mbs);
+		if (!(mbclen > 0 && mbclen <= (size_t)(end - mbscan)))
+			/*
+			 * We treat it as a singlebyte character. This should
+			 * catch error codes 0, (size_t) -1, and (size_t) -2.
+			 */
+			mbclen = 1;
+		mbscan += mbclen;
+	}
+	return mbscan - scan;
+}
+
+/*
  * fw_parse_field --- field parsing using FIELDWIDTHS spec
  *
  * This is called from get_field() via (*parse_field)().
@@ -758,53 +756,53 @@ fw_parse_field(long up_to,	/* parse only up to this field number */
 	char *scan = *buf;
 	long nf = parse_high_water;
 	char *end = scan + len;
-	int nmbc;
-	size_t mbclen;
-	size_t mbslen;
-	size_t lenrest;
-	char *mbscan;
+	const awk_fieldwidth_info_t *fw;
 	mbstate_t mbs;
+	size_t skiplen;
+	size_t flen;
 
-	memset(&mbs, 0, sizeof(mbstate_t));
+	fw = (api_parser_override ? api_fw : FIELDWIDTHS);
 
 	if (up_to == UNLIMITED)
 		nf = 0;
 	if (len == 0)
 		return nf;
-	for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
-		if (gawk_mb_cur_max > 1) {
-			nmbc = 0;
-			mbslen = 0;
-			mbscan = scan;
-			lenrest = end - scan;
-			while (nmbc < len && mbslen < lenrest) {
-				mbclen = mbrlen(mbscan, end - mbscan, &mbs);
-				if (   mbclen == 1
-				    || mbclen == (size_t) -1
-				    || mbclen == (size_t) -2
-				    || mbclen == 0) {
-					/* We treat it as a singlebyte character.  */
-		    			mbclen = 1;
-				}
-				if (mbclen <= end - mbscan) {
-					mbscan += mbclen;
-		    			mbslen += mbclen;
-		    			++nmbc;
-				}
-	    		}
-			(*set)(++nf, scan, (long) mbslen, n);
-			scan += mbslen;
-		} else {
-			if (len > end - scan)
-				len = end - scan;
-			(*set)(++nf, scan, (long) len, n);
-			scan += len;
+	if (gawk_mb_cur_max > 1 && fw->use_chars) {
+		/*
+		 * Reset the shift state. Arguably, the shift state should
+		 * be part of the file state and carried forward at all times,
+		 * but nobody has complained so far, so this may not matter
+		 * in practice.
+		 */
+		memset(&mbs, 0, sizeof(mbstate_t));
+		while (nf < up_to && scan < end) {
+			if (nf >= fw->nf) {
+				*buf = end;
+				return nf;
+			}
+			scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs);
+			flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs);
+			(*set)(++nf, scan, (long) flen, n);
+			scan += flen;
+		}
+	} else {
+		while (nf < up_to && scan < end) {
+			if (nf >= fw->nf) {
+				*buf = end;
+				return nf;
+			}
+			skiplen = fw->fields[nf].skip;
+			if (skiplen > end - scan)
+				skiplen = end - scan;
+			scan += skiplen;
+			flen = fw->fields[nf].len;
+			if (flen > end - scan)
+				flen = end - scan;
+			(*set)(++nf, scan, (long) flen, n);
+			scan += flen;
 		}
 	}
-	if (len == -1)
-		*buf = end;
-	else
-		*buf = scan;
+	*buf = scan;
 	return nf;
 }
 
@@ -857,11 +855,11 @@ get_field(long requested, Func_ptr *assign)
 	/*
 	 * Keep things uniform. Also, mere intention of assigning something
 	 * to $n should not make $0 invalid. Makes sense to invalidate $0
-	 * after the actual assignment is performed. Not a real issue in 
+	 * after the actual assignment is performed. Not a real issue in
 	 * the interpreter otherwise, but causes problem in the
 	 * debugger when watching or printing fields.
 	 */
-  
+
 	if (assign != NULL)
 		*assign = invalidate_field0;	/* $0 needs reconstruction */
 #endif
@@ -893,7 +891,7 @@ get_field(long requested, Func_ptr *assign)
 		if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
 			NF = parse_high_water;
 		else if (parse_field == fpat_parse_field) {
-			/* FPAT parsing is wierd, isolate the special cases */
+			/* FPAT parsing is weird, isolate the special cases */
 			char *rec_start = fields_arr[0]->stptr;
 			char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
 
@@ -930,7 +928,7 @@ set_element(long num, char *s, long len, NODE *n)
 	NODE *sub;
 
 	it = make_string(s, len);
-	it->flags |= MAYBE_NUM;
+	it->flags |= USER_INPUT;
 	sub = make_number((AWKNUM) (num));
 	lhs = assoc_lookup(n, sub);
 	unref(*lhs);
@@ -977,12 +975,12 @@ do_split(int nargs)
 
 	if (sep_arr != NULL) {
 		if (sep_arr == arr)
-			fatal(_("split: cannot use the same array for second and fourth args")); 
+			fatal(_("split: cannot use the same array for second and fourth args"));
 
 		/* This checks need to be done before clearing any of the arrays */
 		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 			if (tmp == arr)
-				fatal(_("split: cannot use a subarray of second arg for fourth arg"));	
+				fatal(_("split: cannot use a subarray of second arg for fourth arg"));
 		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 			if (tmp == sep_arr)
 				fatal(_("split: cannot use a subarray of fourth arg for second arg"));
@@ -1000,6 +998,9 @@ do_split(int nargs)
 		return make_number((AWKNUM) 0);
 	}
 
+	if ((sep->flags & REGEX) != 0)
+		sep = sep->typed_re;
+
 	if (   (sep->re_flags & FS_DFLT) != 0
 	    && current_field_sep() == Using_FS
 	    && ! RS_is_null) {
@@ -1020,10 +1021,7 @@ do_split(int nargs)
 			}
 		} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
 			if (fs->stptr[0] == ' ') {
-				if (do_posix)
-					parseit = posix_def_parse_field;
-				else
-					parseit = def_parse_field;
+				parseit = def_parse_field;
 			} else
 				parseit = sc_parse_field;
 		} else {
@@ -1065,13 +1063,16 @@ do_patsplit(int nargs)
 
 	src = TOP_STRING();
 
+	if ((sep->flags & REGEX) != 0)
+		sep = sep->typed_re;
+
 	fpat = sep->re_exp;
 	if (fpat->stlen == 0)
 		fatal(_("patsplit: third argument must be non-null"));
 
 	if (sep_arr != NULL) {
 		if (sep_arr == arr)
-			fatal(_("patsplit: cannot use the same array for second and fourth args")); 
+			fatal(_("patsplit: cannot use the same array for second and fourth args"));
 
 		/* These checks need to be done before clearing any of the arrays */
 		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
@@ -1102,6 +1103,18 @@ do_patsplit(int nargs)
 	return tmp;
 }
 
+/* set_parser --- update the current (non-API) parser */
+
+static void
+set_parser(parse_field_func_t func)
+{
+	normal_parse_field = func;
+	if (! api_parser_override && parse_field != func) {
+		parse_field = func;
+	        update_PROCINFO_str("FS", current_field_sep_str());
+	}
+}
+
 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
 
 void
@@ -1123,27 +1136,27 @@ set_FIELDWIDTHS()
 		return;
 
 	/*
-	 * If changing the way fields are split, obey least-suprise
+	 * If changing the way fields are split, obey least-surprise
 	 * semantics, and force $0 to be split totally.
 	 */
 	if (fields_arr != NULL)
 		(void) get_field(UNLIMITED - 1, 0);
 
-	parse_field = fw_parse_field;
+	set_parser(fw_parse_field);
 	tmp = force_string(FIELDWIDTHS_node->var_value);
 	scan = tmp->stptr;
 
-	if (FIELDWIDTHS == NULL)
-		emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
-	FIELDWIDTHS[0] = 0;
-	for (i = 1; ; i++) {
+	if (FIELDWIDTHS == NULL) {
+		emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
+		FIELDWIDTHS->use_chars = awk_true;
+	}
+	FIELDWIDTHS->nf = 0;
+	for (i = 0; ; i++) {
 		unsigned long int tmp;
-		if (i + 2 >= fw_alloc) {
+		if (i >= fw_alloc) {
 			fw_alloc *= 2;
-			erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
+			erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS");
 		}
-		/* Initialize value to be end of list */
-		FIELDWIDTHS[i] = -1;
 		/* Ensure that there is no leading `-' sign.  Otherwise,
 		   strtoul would accept it and return a bogus result.  */
 		while (is_blank(*scan)) {
@@ -1156,19 +1169,47 @@ set_FIELDWIDTHS()
 		if (*scan == '\0')
 			break;
 
-		/* Detect an invalid base-10 integer, a valid value that
-		   is followed by something other than a blank or '\0',
-		   or a value that is not in the range [1..INT_MAX].  */
+		// Look for skip value. We allow N:M and N:*.
+		/*
+		 * Detect an invalid base-10 integer, a valid value that
+		 * is followed by something other than a blank or '\0',
+		 * or a value that is not in the range [1..UINT_MAX].
+		 */
 		errno = 0;
 		tmp = strtoul(scan, &end, 10);
+		if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) {
+			FIELDWIDTHS->fields[i].skip = tmp;
+			scan = end + 1;
+			if (*scan == '-' || is_blank(*scan)) {
+				fatal_error = true;
+				break;
+			}
+			// try scanning for field width
+			tmp = strtoul(scan, &end, 10);
+		}
+		else
+			FIELDWIDTHS->fields[i].skip = 0;
+
 		if (errno != 0
 		    	|| (*end != '\0' && ! is_blank(*end))
-				|| !(0 < tmp && tmp <= INT_MAX)
+				|| !(0 < tmp && tmp <= UINT_MAX)
 		) {
-			fatal_error = true;	
+			if (*scan == '*') {
+				for (scan++; is_blank(*scan); scan++)
+					continue;
+
+				if (*scan != '\0')
+					fatal(_("`*' must be the last designator in FIELDWIDTHS"));
+
+				FIELDWIDTHS->fields[i].len = UINT_MAX;
+				FIELDWIDTHS->nf = i+1;
+			}
+			else
+				fatal_error = true;
 			break;
 		}
-		FIELDWIDTHS[i] = tmp;
+		FIELDWIDTHS->fields[i].len = tmp;
+		FIELDWIDTHS->nf = i+1;
 		scan = end;
 		/* Skip past any trailing blanks.  */
 		while (is_blank(*scan)) {
@@ -1177,12 +1218,10 @@ set_FIELDWIDTHS()
 		if (*scan == '\0')
 			break;
 	}
-	FIELDWIDTHS[i+1] = -1;
 
-	update_PROCINFO_str("FS", "FIELDWIDTHS");
 	if (fatal_error)
-		fatal(_("invalid FIELDWIDTHS value, near `%s'"),
-			      scan);
+		fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"),
+			      i + 1, scan);
 }
 
 /* set_FS --- handle things when FS is assigned to */
@@ -1236,7 +1275,7 @@ set_FS()
 	 * FS_regexp will be NULL with a non-null FS_re_yes_case.
 	 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
 	 * Please do not remerge.
-	 */ 
+	 */
 	refree(FS_re_yes_case);
 	refree(FS_re_no_case);
 	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
@@ -1250,7 +1289,7 @@ choose_fs_function:
 	if (! do_traditional && fs->stlen == 0) {
 		static bool warned = false;
 
-		parse_field = null_parse_field;
+		set_parser(null_parse_field);
 
 		if (do_lint && ! warned) {
 			warned = true;
@@ -1259,10 +1298,10 @@ choose_fs_function:
 	} else if (fs->stlen > 1) {
 		if (do_lint_old)
 			warning(_("old awk does not support regexps as value of `FS'"));
-		parse_field = re_parse_field;
+		set_parser(re_parse_field);
 	} else if (RS_is_null) {
 		/* we know that fs->stlen <= 1 */
-		parse_field = sc_parse_field;
+		set_parser(sc_parse_field);
 		if (fs->stlen == 1) {
 			if (fs->stptr[0] == ' ') {
 				default_FS = true;
@@ -1278,10 +1317,7 @@ choose_fs_function:
 			}
 		}
 	} else {
-		if (do_posix)
-			parse_field = posix_def_parse_field;
-		else
-			parse_field = def_parse_field;
+		set_parser(def_parse_field);
 
 		if (fs->stlen == 1) {
 			if (fs->stptr[0] == ' ')
@@ -1290,7 +1326,7 @@ choose_fs_function:
 				/* same special case */
 				strcpy(buf, "[\\\\]");
 			else
-				parse_field = sc_parse_field;
+				set_parser(sc_parse_field);
 		}
 	}
 	if (remake_re) {
@@ -1302,7 +1338,7 @@ choose_fs_function:
 			FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
 			FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
 			FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
-			parse_field = re_parse_field;
+			set_parser(re_parse_field);
 		} else if (parse_field == re_parse_field) {
 			FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
 			FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
@@ -1318,16 +1354,16 @@ choose_fs_function:
 	 */
 	if (fs->stlen == 1 && parse_field == re_parse_field)
 		FS_regexp = FS_re_yes_case;
-
-	update_PROCINFO_str("FS", "FS");
 }
 
-/* current_field_sep --- return what field separator is */
+/* current_field_sep --- return the field separator type */
 
 field_sep_type
 current_field_sep()
 {
-	if (parse_field == fw_parse_field)
+	if (api_parser_override)
+		return Using_API;
+	else if (parse_field == fw_parse_field)
 		return Using_FIELDWIDTHS;
 	else if (parse_field == fpat_parse_field)
 		return Using_FPAT;
@@ -1335,6 +1371,21 @@ current_field_sep()
 		return Using_FS;
 }
 
+/* current_field_sep_str --- return the field separator type as a string */
+
+const char *
+current_field_sep_str()
+{
+	if (api_parser_override)
+		return "API";
+	else if (parse_field == fw_parse_field)
+		return "FIELDWIDTHS";
+	else if (parse_field == fpat_parse_field)
+		return "FPAT";
+	else
+		return "FS";
+}
+
 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
 
 void
@@ -1421,7 +1472,7 @@ set_FPAT()
 
 set_fpat_function:
 	fpat = force_string(FPAT_node->var_value);
-	parse_field = fpat_parse_field;
+	set_parser(fpat_parse_field);
 
 	if (remake_re) {
 		refree(FPAT_re_yes_case);
@@ -1432,8 +1483,6 @@ set_fpat_function:
 		FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
 		FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
 	}
-
-	update_PROCINFO_str("FS", "FPAT");
 }
 
 /*
@@ -1471,101 +1520,65 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs)
  * via (*parse_field)().  This variation is for when FPAT is a regular
  * expression -- use the value to find field contents.
  *
- * This was really hard to get right.  It happens to bear many resemblances
- * to issues I had with getting gsub right with null matches. When dealing
- * with that I prototyped in awk and had the foresight to save the awk code
- * over in the C file.  Starting with that as a base, I finally got to this
- * awk code to do what I needed, and then translated it into C. Fortunately
- * the C code bears a closer correspondance to the awk code here than over
- * by gsub.
+ * The FPAT parsing logic is a bit difficult to specify. In particular
+ * to allow null fields at certain locations. To make the code as robust
+ * as possible, an awk reference implementation was written and tested
+ * as a first step, and later recoded in C, preserving its structure as
+ * much as possible.
  *
- * BEGIN {
- * 	false = 0
- * 	true = 1
- * 
- * 	fpat[1] = "([^,]*)|(\"[^\"]+\")"
- * 	fpat[2] = fpat[1]
- * 	fpat[3] = fpat[1]
- * 	fpat[4] = "aa+"
- * 	fpat[5] = fpat[4]
- * 
- * 	data[1] = "Robbins,,Arnold,"
- * 	data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * 	data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
- * 	data[4] = "bbbaaacccdddaaaaaqqqq"
- * 	data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
+ * # Reference implementation of the FPAT record parsing.
+ * #
+ * # Each loop iteration identifies a (separator[n-1],field[n]) pair.
+ * # Each loop iteration must consume some characters, except for the first field.
+ * # So a null field is only valid as a first field or after a non-null separator.
+ * # A null record has no fields (not a single null field).
  * 
- * 	for (i = 1; i in data; i++) {
- * 		printf("Splitting: <%s>\n", data[i])
- * 		n = mypatsplit(data[i], fields, fpat[i], seps)
- * 		print "n =", n
- * 		for (j = 1; j <= n; j++)
- * 			printf("fields[%d] = <%s>\n", j, fields[j])
- * 		for (j = 0; j in seps; j++)
- * 			printf("seps[%s] = <%s>\n", j, seps[j])
- * 	}
- * }
- * 
- * function mypatsplit(string, array, pattern, seps,
- * 			eosflag, non_empty, nf) # locals
+ * function refpatsplit(string, fields, pattern, seps,
+ *         parse_start, sep_start, field_start, field_length, field_found, nf) # locals
  * {
- * 	delete array
- * 	delete seps
- * 	if (length(string) == 0)
- * 		return 0
- * 
- * 	eosflag = non_empty = false
- * 	nf = 0
- * 	while (match(string, pattern)) {
- * 		if (RLENGTH > 0) {	# easy case
- * 			non_empty = true
- * 			if (! (nf in seps)) {
- * 				if (RSTART == 1)	# match at front of string
- * 					seps[nf] = ""
- * 				else
- * 					seps[nf] = substr(string, 1, RSTART - 1)
- * 			}
- * 			array[++nf] = substr(string, RSTART, RLENGTH)
- * 			string = substr(string, RSTART+RLENGTH)
- * 			if (length(string) == 0)
- * 				break
- * 		} else if (non_empty) {
- * 			# last match was non-empty, and at the
- * 			# current character we get a zero length match,
- * 			# which we don't want, so skip over it
- * 			non_empty = false
- * 			seps[nf] = substr(string, 1, 1)
- * 			string = substr(string, 2)
- * 		} else {
- * 			# 0 length match
- * 			if (! (nf in seps)) {
- * 				if (RSTART == 1)
- * 					seps[nf] = ""
- * 				else
- * 					seps[nf] = substr(string, 1, RSTART - 1)
- * 			}
- * 			array[++nf] = ""
- * 			if (! non_empty && ! eosflag) { # prev was empty
- * 				seps[nf] = substr(string, 1, 1)
- * 			}
- * 			if (RSTART == 1) {
- * 				string = substr(string, 2)
- * 			} else {
- * 				string = substr(string, RSTART + 1)
- * 			}
- * 			non_empty = false
- * 		}
- * 		if (length(string) == 0) {
- * 			if (eosflag)
- * 				break
- * 			else
- * 				eosflag = true
- * 		}
- * 	}
- * 	if (length(string) > 0)
- * 		seps[nf] = string
+ *     # Local state variables:
+ *     # - parse_start: pointer to the first not yet consumed character
+ *     # - sep_start: pointer to the beginning of the parsed separator
+ *     # - field start: pointer to the beginning of the parsed field
+ *     # - field length: length of the parsed field
+ *     # - field_found: flag for succesful field match
+ *     # - nf: Number of fields found so far
+ *     
+ *     # Prepare for parsing
+ *     parse_start = 1   # first not yet parsed char
+ *     nf = 0            # fields found so far
+ *     delete fields
+ *     delete seps
  * 
- * 	return length(array)
+ *     # Loop that consumes the whole record
+ *     while (parse_start <= length(string)) {  # still something to parse
+ *     
+ *         # first attempt to match the next field
+ *         sep_start = parse_start
+ *         field_found = match(substr(string, parse_start), pattern)
+ *         
+ *         # check for an invalid null field and retry one character away
+ *         if (nf > 0 && field_found && RSTART==1 && RLENGTH==0) {
+ *             parse_start++
+ *             field_found = match(substr(string, parse_start), pattern)
+ *         }
+ *         
+ *         # store the (sep[n-1],field[n]) pair
+ *         if (field_found) {
+ *             field_start = parse_start + RSTART - 1
+ *             field_length = RLENGTH
+ *             seps[nf] = substr(string, sep_start, field_start-sep_start)
+ *             fields[++nf] = substr(string, field_start, field_length)
+ *             parse_start = field_start + field_length
+ *             
+ *         # store the final extra sep after the last field
+ *         } else {
+ *             seps[nf] = substr(string, sep_start)
+ *             parse_start = length(string) + 1
+ *         }
+ *     }
+ *     
+ *     return nf
  * }
  */
 static long
@@ -1584,10 +1597,9 @@ fpat_parse_field(long up_to,	/* parse only up to this field number */
 	char *start;
 	char *end = scan + len;
 	int regex_flags = RE_NEED_START;
-	bool need_to_set_sep;
-	bool non_empty;
-	bool eosflag;
 	mbstate_t mbs;
+	char* field_start;
+	bool field_found = false;
 
 	memset(&mbs, 0, sizeof(mbstate_t));
 
@@ -1600,90 +1612,55 @@ fpat_parse_field(long up_to,	/* parse only up to this field number */
 	if (rp == NULL) /* use FPAT */
 		rp = FPAT_regexp;
 
-	if (in_middle) {
-		regex_flags |= RE_NO_BOL;
-	}
-	non_empty = rp->non_empty;
+	while (scan < end && nf < up_to) {  /* still something to parse */
 
-	eosflag = false;
-	need_to_set_sep = true;
-	start = scan;
-	while (research(rp, scan, 0, (end - scan), regex_flags) != -1
-	       && nf < up_to) {
+		/* first attempt to match the next field */
+		start = scan;
+		field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
 
-		if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
-			non_empty = true;
-			if (sep_arr != NULL && need_to_set_sep) {
-				if (RESTART(rp, scan) == 0) /* match at front */
-		    			set_element(nf, start, 0L, sep_arr);
+		/* check for an invalid null field and retry one character away */ 
+		if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */
+			increment_scan(& scan, end - scan);
+			field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1;
+		}
+
+		/* store the (sep[n-1],field[n]) pair */
+		if (field_found) {
+			field_start = scan + RESTART(rp, scan);
+			if (sep_arr != NULL) { /* store the separator */
+				if (field_start == start) /* match at front */
+					set_element(nf, start, 0L, sep_arr);
 				else
-		    			set_element(nf,
+					set_element(nf,
 						start,
-						(long) RESTART(rp, scan),
+						(long) (field_start - start),
 						sep_arr);
 			}
 			/* field is text that matched */
 			(*set)(++nf,
-				scan + RESTART(rp, scan),
+				field_start,
 				(long)(REEND(rp, scan) - RESTART(rp, scan)),
 				n);
-
 			scan += REEND(rp, scan);
-			if (scan >= end)
-				break;
-			need_to_set_sep = true;
-		} else if (non_empty) { /* else if non_empty */
-			/*
-			 * last match was non-empty, and at the
-			 * current character we get a zero length match,
-			 * which we don't want, so skip over it
-			 */ 
-			non_empty = false;
-			if (sep_arr != NULL) {
-				need_to_set_sep = false;
-		    		set_element(nf, start, 1L, sep_arr);
-			}
-			increment_scan(& scan, end - scan);
+
 		} else {
-			/* 0 length match */
-			if (sep_arr != NULL && need_to_set_sep) {
-				if (RESTART(rp, scan) == 0) /* RSTART == 1 */
-		    			set_element(nf, start, 0L, sep_arr);
-				else
-		    			set_element(nf, start,
-							(long) RESTART(rp, scan),
-							sep_arr);
-			}
-			need_to_set_sep = true;
-			(*set)(++nf, scan, 0L, n);
-			if (! non_empty && ! eosflag) { /* prev was empty */
-				if (sep_arr != NULL) {
-		    			set_element(nf, start, 1L, sep_arr);
-					need_to_set_sep = false;
-				}
-			}
-			if (RESTART(rp, scan) == 0)
-				increment_scan(& scan, end - scan);
-			else {
-				scan += RESTART(rp, scan);
-			}
-			non_empty = false;
-		}
-		if (scan >= end) { /* length(string) == 0 */
-			if (eosflag)
-				break;
-			else
-				eosflag = true;
+			/*
+			 * No match, store the final extra separator after
+			 * the last field.
+			 */
+			if (sep_arr != NULL)
+				set_element(nf, start, (long) (end - start), sep_arr);
+			scan = end;
 		}
-
-		start = scan;
-	}
-	if (scan < end) {
-		if (sep_arr != NULL)
-    			set_element(nf, scan, (long) (end - scan), sep_arr);
 	}
 
+	/*
+	 * If the last field extends up to the end of the record, generate
+	 * a null trailing separator
+	 */
+	if (sep_arr != NULL && scan == end && field_found) 
+		set_element(nf, scan, 0L, sep_arr);
+
 	*buf = scan;
-	rp->non_empty = non_empty;
 	return nf;
 }