Revise array sorting for PROCINFO["sorted_in"].

author: Arnold D. Robbins <arnold@skeeve.com> 2011-03-29 20:51:30 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2011-03-29 20:51:30 +0200
commit: 2400cc5143383a881356a9f55e93b60037d851e5 (patch)
tree: 3ba75f8b1225bf50e48b4f6c381e7772f1ab6c5b /array.c
parent: 4fe569fb78dd1b25822c16c9cac515a0fc6702a4 (diff)
download: egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.gz
egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.bz2
egawk-2400cc5143383a881356a9f55e93b60037d851e5.zip
1 files changed, 545 insertions, 0 deletions
diff --git a/array.c b/array.c
index b821b5e4..0b03397c 100644
--- a/array.c
+++ b/array.c
@@ -54,6 +54,24 @@ static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, si
 
 unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash;
 
+/* used by sort_maybe_numeric_index() and sort_up_index_number() */
+struct sort_num { AWKNUM sort_numbr; NODE *sort_index; };
+
+/* sorting support code */
+static qsort_compfunc sort_selection(NODE *, const char *, int);
+static size_t sort_match(const char *, size_t, const char *);
+static int sort_ignorecase(const char *, size_t, const char *, size_t);
+
+/* qsort callback routines */
+static int sort_up_index_string(const void *, const void *);
+static int sort_down_index_string(const void *, const void *);
+static int sort_up_index_ignrcase(const void *, const void *);
+static int sort_down_index_ignrcase(const void *, const void *);
+static int sort_up_index_number(const void *, const void *);
+static int sort_down_index_number(const void *, const void *);
+static int sort_up_value(const void *, const void *);
+static int sort_down_value(const void *, const void *);
+
 /* array_init --- possibly temporary function for experimentation purposes */
 
 void
@@ -1273,6 +1291,533 @@ do_asorti(int nargs)
 	return asort_actual(nargs, INDEX);
 }
 
+
+/* comp_func --- array index comparison function for qsort, used in debug.c */
+
+int
+comp_func(const void *p1, const void *p2)
+{
+	return sort_up_index_string(p1, p2);
+}
+
+/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */
+
+static int
+sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2)
+{
+	size_t l;
+	int ret;
+
+	l = (l1 < l2) ? l1 : l2;
+#ifdef MBS_SUPPORT
+	if (gawk_mb_cur_max > 1) {
+		ret = strncasecmpmbs((const unsigned char *) s1,
+				     (const unsigned char *) s2, l);
+	} else
+#endif
+	for (ret = 0; l-- > 0 && ret == 0; s1++, s2++)
+		ret = casetable[*(unsigned char *) s1]
+		      - casetable[*(unsigned char *) s2];
+	if (ret == 0 && l1 != l2)
+		ret = (l1 < l2) ? -1 : 1;
+	return ret;
+}
+
+/*
+ * sort_up_index_string --- qsort comparison function; ascending index strings;
+ * index strings are distinct within an array, so no comparisons will yield
+ * equal and warrant disambiguation
+ */
+
+static int
+sort_up_index_string(const void *p1, const void *p2)
+{
+	const NODE *t1, *t2;
+	const char *str1, *str2;
+	size_t len1, len2;
+	int ret;
+
+	/* Array indexes are strings and distinct, never equal */
+	t1 = *((const NODE *const *) p1);
+	t2 = *((const NODE *const *) p2);
+
+	str1 = t1->ahname_str;
+	len1 = t1->ahname_len;
+	str2 = t2->ahname_str;
+	len2 = t2->ahname_len;
+
+	ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2);
+	/*
+	 * if they compared equal but their lengths differ, the
+	 * shorter one is a prefix of the longer and compares as less
+	 */
+	if (ret == 0 && len1 != len2)
+		ret = (len1 < len2) ? -1 : 1;
+
+	/* indices are unique within an array, so result should never be 0 */
+	assert(ret != 0);
+	return ret;
+}
+
+/* sort_down_index_string --- descending index strings */
+
+static int
+sort_down_index_string(const void *p1, const void *p2)
+{
+	/*
+	 * Negation versus transposed arguments:  when all keys are
+	 * distinct, as with array indices here, either method will
+	 * transform an ascending sort into a descending one.  But if
+	 * there are equal keys--such as when IGNORECASE is honored--
+	 * that get disambiguated into a determisitc order, negation
+	 * will reverse those but transposed arguments would retain
+	 * their relative order within the rest of the reversed sort.
+	 */
+	return -sort_up_index_string(p1, p2);
+}
+
+/*
+ * sort_up_index_ignrcase --- ascending index string, case insensitive;
+ * case insensitive comparison can cause distinct index strings to compare
+ * equal, so disambiguation in necessary
+ */
+
+static int
+sort_up_index_ignrcase(const void *p1, const void *p2)
+{
+	const NODE *t1, *t2;
+	int ret;
+
+	t1 = *((const NODE *const *) p1);
+	t2 = *((const NODE *const *) p2);
+
+	ret = sort_ignorecase(t1->ahname_str, t1->ahname_len,
+			      t2->ahname_str, t2->ahname_len);
+
+	/*
+	 * if case insensitive result is "they're the same",
+	 * use case sensitive comparison to force distinct order
+	 */
+	if (ret == 0)
+		ret = sort_up_index_string(p1, p2);
+	return ret;
+}
+
+/* sort_down_index_ignrcase --- descending index strings, case insensitive */
+
+static int
+sort_down_index_ignrcase(const void *p1, const void *p2)
+{
+	return -sort_up_index_ignrcase(p1, p2);
+}
+
+/*
+ * sort_up_index_number --- qsort comparison function; ascending index numbers;
+ * the args have been built for us by sort_maybe_numeric_index() and point
+ * to custom structs rather than to gawk nodes
+ */
+
+static int
+sort_up_index_number(const void *p1, const void *p2)
+{
+	const struct sort_num *n1, *n2;
+	int ret;
+
+	n1 = *((const struct sort_num *const *) p1);
+	n2 = *((const struct sort_num *const *) p2);
+
+	if (n1->sort_numbr < n2->sort_numbr)
+		ret = -1;
+	else
+		ret = (n1->sort_numbr > n2->sort_numbr);
+
+	/* break a tie with the index string itself */
+	if (ret == 0)
+		return sort_up_index_string((const void *) n1->sort_index,
+					    (const void *) n2->sort_index);
+	return ret;
+}
+
+/* sort_down_index_number --- descending index numbers */
+
+static int
+sort_down_index_number(const void *p1, const void *p2)
+{
+	return -sort_up_index_number(p1, p2);
+}
+
+/*
+ * sort_up_value --- qsort comparison function; ascending values;
+ * standard awk comparison rules apply, so might be by number or might
+ * be by string depending upon the values involved; if a value happens
+ * to be a sub-array, it will compare greater than any scalar (last in
+ * ascending order) and equal with any other sub-array (disambiguated
+ * by index string, the same as with equal scalar values)
+ */
+
+static int
+sort_up_value(const void *p1, const void *p2)
+{
+	const NODE *idx1, *idx2;
+	NODE *val1, *val2;
+	int ret;
+
+	/* we're passed a pair of index (array subscript) nodes */
+	idx1 = *(const NODE *const *) p1;
+	idx2 = *(const NODE *const *) p2;
+	if (idx1->type != Node_ahash || idx2->type != Node_ahash)
+		cant_happen();
+
+	/* and we want to compare the element values they refer to */
+	val1 = idx1->hvalue;
+	val2 = idx2->hvalue;
+	if (val1->type == Node_var_array) {
+		if (val2->type == Node_var_array)
+			ret = 0;	/* both sub-arrays, hence tie */
+		else
+			ret = 1;	/* val1 (sub-array) > val2 (scalar) */
+	} else {
+		if (val2->type == Node_var_array)
+			ret = -1;	/* val1 (scalar) < val2 (sub-array) */
+		else
+			ret = cmp_nodes(val1, val2);	/* both scalars */
+	}
+
+	/* disambiguate ties to force a deterministic order */
+	if (ret == 0)
+		ret = sort_up_index_string(p1, p2);
+	return ret;
+}
+
+/* sort_down_value --- descending value maybe-nums */
+
+static int
+sort_down_value(const void *p1, const void *p2)
+{
+	return -sort_up_value(p1, p2);
+}
+
+/* sort_maybe_numeric_index --- special handling for sorting indices as numbers */
+
+void
+sort_maybe_numeric_index(qsort_compfunc func, NODE **list,
+			 size_t num_elems, int setup)
+{
+	static const NODE empty_node;
+
+	size_t i;
+	NODE temp_node;
+	struct sort_num *fake_node;
+
+	/* if we're not sorting indices by numeric value, do nothing */
+	if (func != sort_up_index_number && func != sort_down_index_number)
+		return;
+
+	/*
+	 * Insert a new struct in front of each index, so there's somewhere
+	 * to store its numeric value after that's calculated once, avoiding
+	 * having the qsort callback recalculate it for every comparison.
+	 * Our caller will call us back to destroy this intermediate array
+	 * as soon as qsort() finishes; the rest of gawk won't see it.
+	 * We could use--or perhaps mis-use--gawk nodes here, but they'd tie
+	 * up quite a bit more memory during the sort.
+	 */
+	if (setup) {
+		temp_node = empty_node;	/* init union fields to 0,NULL */
+		for (i = 0; i < num_elems; ++i) {
+			emalloc(fake_node, struct sort_num *,
+				sizeof (struct sort_num), "sort numeric index");
+			/* populate enough fields to satisfy force_number();
+			   we don't need to maintain reference counts */
+			temp_node.type = Node_val;
+			temp_node.stptr = list[i]->ahname_str;
+			temp_node.stlen = list[i]->ahname_len;
+			temp_node.flags = PERM | STRING | MAYBE_NUM;
+			/* insert the extra data in front of our input node */
+			fake_node->sort_numbr = force_number(&temp_node);
+			fake_node->sort_index = list[i];
+			list[i] = (NODE *) fake_node;
+		}
+	} else {	/* post sort cleanup */
+		for (i = 0; i < num_elems; ++i) {
+			/* undo the setup manipulations */
+			fake_node = (struct sort_num *) list[i];
+			list[i] = fake_node->sort_index;
+			efree((void *) fake_node);
+		}
+	}
+}
+
+/* sort_match --- compare leading part of a string against a sort option */
+
+static size_t
+sort_match(const char *str, size_t slen, const char *target)
+{
+	const char *endofword;
+
+	/*
+	 * STR comes from the user and is SLEN characters long;
+	 * if it has spaces then we care about the subset up until
+	 * its first space rather than the full length.
+	 * We check whether STR is the same as--or proper prefix
+	 * of--TARGET.
+	 * Accept "by-foo" or "as-foo" as a synonym match for "foo".
+	 */
+	endofword = strchr(str, ' ');
+	if (endofword != NULL)
+		slen = (size_t) (endofword - str);
+	/* try for exact match */
+	if (slen > 0 && strncasecmp(str, target, slen) == 0)
+		return slen;
+	/* ingore "by-" or "as-" prefix and try again */
+	if (slen > 3
+	    && (strncasecmp(str, "by-", 3) == 0
+		|| strncasecmp(str, "as-", 3) == 0)
+	    && strncasecmp(str + 3, target, slen - 3) == 0)
+		return slen;
+	/* no match */
+	return 0;
+}
+
+/*
+ * sort_selection --- parse user-specified sort ordering
+ * ("ascending index" or "value number" and so forth);
+ * returns a qsort comparison function
+ */
+
+static qsort_compfunc
+sort_selection(NODE *r, const char *warn_arg, int default_by_value)
+{
+	enum sort_bits {
+		unrecognized =    0,
+		Unsorted     = 0x01,
+		Ascending    = 0x02,
+		Descending   = 0x04,
+		by_Index     = 0x08,
+		by_Value     = 0x10,
+		as_String    = 0x20,
+		as_Number    = 0x40,
+		as_Inserted  = 0x80,
+		allbitsused  = 0xff
+	};
+	/*
+	 * The documented values are singular, but it's trivial to accept
+	 * "index numbers" and "descending values" since we're using a
+	 * prefix match.  Latin plural of index is the only complication.
+	 */
+	static const struct sort_keys {
+		const char *const keyword;
+		enum  sort_bits keybit;
+	} sorts[] = {
+		{ "unsorted",   Unsorted    },	/* not part of a three-part key */
+		{ "ascending",  Ascending   },	/* ascending vs descending */
+		{ "descending", Descending  },
+		{ "indexes",    by_Index    },	/* by_Index vs by_Number */
+		{ "indices",    by_Index    },	/* synonym for plural match */
+		{ "values",     by_Value    },
+		{ "strings",    as_String   },	/* as_String vs as_Number */
+		{ "numbers",    as_Number   },
+		{ "numeric",    as_Number   },	/* synonym; singular only */
+		{ "inserted",   as_Inserted },	/* not part of a three-part key */
+		{ "insertion",  as_Inserted },	/* another synonym */
+	};
+
+	static short warned_unrecognized = FALSE;
+	static enum sort_bits prev_invalid = unrecognized;
+
+	char *s;
+	size_t l, matchlen;
+	unsigned i;
+	const char *errfmt;
+	enum sort_bits allparts;
+	qsort_compfunc sort_func, default_sort_func;
+
+	/* deduce our caller from the args provided */
+	if (warn_arg != NULL)		/* for-in statement */
+		default_sort_func = (qsort_compfunc) 0;	/* unsorted */
+	else if (default_by_value)	/* asort() */
+		default_sort_func = sort_up_value;
+	else				/* asorti() */
+		default_sort_func = ! IGNORECASE ? sort_up_index_string
+						: sort_up_index_ignrcase;
+
+	r = force_string(r);
+	s = r->stptr;
+	l = r->stlen;
+
+	/* treat empty sort-order string as request for default */
+	if (l == 0)
+		return default_sort_func;
+
+	/* no ordering specification yet */
+	allparts = unrecognized;
+	/*
+	 * Scan through S until length L is exhausted, checking its
+	 * starting word against each possible ordering choice and
+	 * then skipping past that to get to the next word.  If no
+	 * choice matches, an error has been detected.  Duplicate
+	 * words are accepted; contradictory ones get noticed and
+	 * rejected after the string parsing has completed.
+	 */
+	while (l > 0) {
+		matchlen = 0;	/* lint suppression */
+		for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) {
+			matchlen = sort_match(s, l, sorts[i].keyword);
+			if (matchlen > 0) {
+				allparts |= sorts[i].keybit;
+				break;
+			}
+		}
+		if (matchlen == 0) {
+			/* failed to match any possible component */
+			allparts = unrecognized;
+			break;
+		}
+		/* move past the part just handled */
+		s += matchlen, l -= matchlen;
+		/* (l > 0) here would accept a trailing space; (l > 1) won't */
+		if (l > 1 && *s == ' ')
+			++s, --l;
+	} /* while (l > 0) */
+
+	/*
+	 * If we have one or two parts of a three part key, fill in
+	 * whichever parts are missing with default values.  For key
+	 * type the default is specified by our caller, for direction
+	 * it is Ascending, and for comparison mode it is as_String
+	 * but only when sorting by_Index.  When sorting by_Value,
+	 * comparison mode is out of the user's control and we don't
+	 * need or want to fill in either as_String or as_Number.
+	 */
+	if ((allparts & (Ascending|Descending|as_String|as_Number)) != 0
+	    && (allparts & (Unsorted|by_Index|by_Value|as_Inserted)) == 0)
+		allparts |= default_by_value ? by_Value : by_Index;
+	if ((allparts & (by_Index|by_Value|as_String|as_Number)) != 0
+	    && (allparts & (Unsorted|Ascending|Descending|as_Inserted)) == 0)
+		allparts |= Ascending;
+	/* by_Value is handled differently from by_Index */
+	if ((allparts & (Ascending|Descending|by_Index)) != 0
+	    && (allparts & (Unsorted|by_Value|as_String|as_Number|as_Inserted)) == 0)
+		allparts |= as_String;
+
+	/*
+	 * ALLPARTS is a bit mask of all the user's specified ordering
+	 * choices; partial specifications have been extended into full
+	 * three part key.  Valid combinations yield a qsort(3) comparison
+	 * routine result.  Invalid ones are rejected and yield default
+	 * sort, with an error message for asort()/asorti(), a warning
+	 * for for-in statement if this is a different invalid combination
+	 * than the most recently warned about one, or a one-time warning
+	 * if this is an unrecognized ordering specification and we're
+	 * running in lint checking mode.
+	 */
+	sort_func = default_sort_func;
+	errfmt = NULL;
+	switch (allparts) {
+	case Unsorted:
+		sort_func = (qsort_compfunc) 0;
+		break;
+	case Ascending|by_Index|as_String:
+		sort_func = ! IGNORECASE ? sort_up_index_string
+					: sort_up_index_ignrcase;
+		break;
+	case Descending|by_Index|as_String:
+		sort_func = ! IGNORECASE ? sort_down_index_string
+					: sort_down_index_ignrcase;
+		break;
+	case Ascending|by_Index|as_Number:
+		sort_func = sort_up_index_number;
+		break;
+	case Descending|by_Index|as_Number:
+		sort_func = sort_down_index_number;
+		break;
+	case Ascending|by_Value:
+		sort_func = sort_up_value;
+		break;
+	case Descending|by_Value:
+		sort_func = sort_down_value;
+		break;
+	/* for value sorts, distinction between string and number is not
+	   allowed; they're always ordered by basic awk MAYBE_NUM comparison */
+	case Ascending|by_Value|as_String:
+	case Descending|by_Value|as_String:
+		if (errfmt == NULL)
+			errfmt = _("%s: sorting by value can't be forced to use string comparison");
+		/* FALL THROUGH */
+	case Ascending|by_Value|as_Number:
+	case Descending|by_Value|as_Number:
+		if (errfmt == NULL)
+			errfmt = _("%s: sorting by value can't be forced to use numeric comparison");
+		/* FALL THROUGH */
+	/* gawk doesn't maintain the data necessary to sort by
+	   order of insertion */
+	case as_Inserted:
+		if (errfmt == NULL)
+			errfmt = _("%s: sort ordering \"as-inserted\" is not implemented");
+		/* FALL THROUGH */
+	default:
+		/* sort_func still has default value */
+		if (errfmt == NULL)
+			errfmt = (allparts == unrecognized)
+				? _("%s: sort specification is not recognized")
+				: _("%s: sort specification is invalid");
+		if (warn_arg == NULL) {
+			/* asort()/asorti() bad sort specification argument */
+			error(errfmt, default_by_value ? "asort" : "asorti");
+		} else if (allparts != unrecognized) {
+			/* invalid combination of valid ordering components */
+			if (allparts != prev_invalid)
+				warning(errfmt, warn_arg);
+			prev_invalid = allparts;
+		} else if (do_lint) {
+			/* unrecognized ordering specification string */              
+			if (! warned_unrecognized)
+				lintwarn(errfmt, warn_arg);
+			warned_unrecognized = TRUE;
+		}
+		break;
+	}
+	return sort_func;
+}
+
+/*
+ * sorted_in --- fetch value of PROCINFO["sorted_in"] and use sort_selection()
+ * to parse it; controls whether and how traversal of ``for (index in array)''
+ * should be sorted; returns a qsort comparison function
+ */
+
+qsort_compfunc
+sorted_in(void)		/* called from r_interpret(eval.c) */
+{
+	static NODE *sorted_str = NULL;
+	static short warned_extension = FALSE;
+
+	NODE *r;
+
+	/* if there's no PROCINFO[], there's no ["sorted_in"], so no sorting */
+	if (PROCINFO_node == NULL)
+		return (qsort_compfunc) 0;
+
+	if (sorted_str == NULL)		/* do this once */
+		sorted_str = make_string("sorted_in", 9);
+
+	r = (NODE *) NULL;
+	if (in_array(PROCINFO_node, sorted_str))
+		r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE);
+	/* if there's no PROCINFO["sorted_in"], there's no sorting */
+	if (r == NULL)
+		return (qsort_compfunc) 0;
+
+	/* we're going to make use of PROCINFO["sorted_in"] */
+	if (do_lint && ! warned_extension) {
+		warned_extension = TRUE;
+		lintwarn(_("sorted array traversal is a gawk extension"));
+	}
+
+	return sort_selection(r, "`PROCINFO[\"sorted_in\"]'", FALSE);
+}
+
+
 /*
 From bonzini@gnu.org  Mon Oct 28 16:05:26 2002
 Date: Mon, 28 Oct 2002 13:33:03 +0100
author	Arnold D. Robbins <arnold@skeeve.com>	2011-03-29 20:51:30 +0200
committer	Arnold D. Robbins <arnold@skeeve.com>	2011-03-29 20:51:30 +0200
commit	2400cc5143383a881356a9f55e93b60037d851e5 (patch)
tree	3ba75f8b1225bf50e48b4f6c381e7772f1ab6c5b /array.c
parent	4fe569fb78dd1b25822c16c9cac515a0fc6702a4 (diff)
download	egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.gz egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.bz2 egawk-2400cc5143383a881356a9f55e93b60037d851e5.zip