More array sorting changes from John.

author: Arnold D. Robbins <arnold@skeeve.com> 2011-04-18 10:23:54 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2011-04-18 10:23:54 +0300
commit: bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32 (patch)
tree: 41cf2343944c2836150422b1bafabdaa73ff32bb /array.c
parent: 98eb78c2d05870952fe25bbe37b86df2017f42fc (diff)
download: egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.tar.gz
egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.tar.bz2
egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.zip
1 files changed, 451 insertions, 674 deletions
diff --git a/array.c b/array.c
index 5d1c5884..15053bb0 100644
--- a/array.c
+++ b/array.c
@@ -54,24 +54,16 @@ static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, si
 
 unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash;
 
-/* used by sort_maybe_numeric_index() and sort_up_index_number() */
-struct sort_num { AWKNUM sort_numbr; NODE *sort_index; };
-
-/* sorting support code */
-static qsort_compfunc sort_selection(NODE *, const char *, int);
-static size_t sort_match(const char *, size_t, const char *);
-static int sort_ignorecase(const char *, size_t, const char *, size_t);
-static int sort_cmp_nodes(NODE *, NODE *);
-
-/* qsort callback routines */
+/* qsort comparison function */
 static int sort_up_index_string(const void *, const void *);
 static int sort_down_index_string(const void *, const void *);
-static int sort_up_index_ignrcase(const void *, const void *);
-static int sort_down_index_ignrcase(const void *, const void *);
 static int sort_up_index_number(const void *, const void *);
 static int sort_down_index_number(const void *, const void *);
-static int sort_up_value(const void *, const void *);
-static int sort_down_value(const void *, const void *);
+static int sort_up_value_string(const void *, const void *);
+static int sort_down_value_string(const void *, const void *);
+static int sort_up_value_number(const void *, const void *);
+static int sort_down_value_number(const void *, const void *);
+
 
 /* array_init --- possibly temporary function for experimentation purposes */
 
@@ -951,9 +943,9 @@ do_adump(int nargs)
  * ajb@woti.com.
  */
 
-/* dup_table --- duplicate input symbol table "symbol" */
+/* dup_table --- recursively duplicate input array "symbol" */
 
-static void
+static NODE *
 dup_table(NODE *symbol, NODE *newsymb)
 {
 	NODE **old, **new, *chain, *bucket;
@@ -986,6 +978,7 @@ dup_table(NODE *symbol, NODE *newsymb)
 					bucket->type = Node_ahash;
 					bucket->flags |= MALLOC;
 					bucket->ahname_ref = 1;
+					bucket->ahcode = chain->ahcode;
 
 					/*
 					 * copy the corresponding name and
@@ -1007,8 +1000,7 @@ dup_table(NODE *symbol, NODE *newsymb)
 						emalloc(aname, char *, aname_len + 2, "dup_table");
 						sprintf(aname, "%s[\"%.*s\"]", newsymb->vname, (int) chain->ahname_len, chain->ahname_str);
 						r->vname = aname;
-						dup_table(chain->ahvalue, r);
-						bucket->ahvalue = r;
+						bucket->ahvalue = dup_table(chain->ahvalue, r);
 					} else
 						bucket->ahvalue = dupnode(chain->ahvalue);
 
@@ -1026,237 +1018,30 @@ dup_table(NODE *symbol, NODE *newsymb)
 
 	newsymb->var_array = new;
 	newsymb->array_size = cursize;
-}
-
-/* merge --- do a merge of two sorted lists */
-
-static NODE *
-merge(NODE *left, NODE *right)
-{
-	NODE *ans, *cur;
-
-	/*
-	 * Use sort_cmp_nodes(): see the comment there as to why. That
-	 * function will call cmp_nodes() on strings, which means that
-	 * IGNORECASE influences the comparison.  This is OK, but it may
-	 * be surprising.  This comment serves to remind us that we
-	 * know about this and that it's OK.
-	 */
-	if (sort_cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
-		ans = cur = left;
-		left = left->ahnext;
-	} else {
-		ans = cur = right;
-		right = right->ahnext;
-	}
-
-	while (left != NULL && right != NULL) {
-		if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
-			cur->ahnext = left;
-			cur = left;
-			left  = left->ahnext;
-		} else {
-			cur->ahnext = right;
-			cur = right;
-			right = right->ahnext;
-		}
-	}
-
-	cur->ahnext = (left != NULL ? left : right);
-
-	return ans;
-}
-
-/* merge_sort --- recursively sort the left and right sides of a list */
-
-static NODE *
-merge_sort(NODE *left, unsigned long size)
-{
-	NODE *right, *tmp;
-	int i, half;
-
-	if (size <= 1)
-		return left;
-
-	/* walk down the list, till just one before the midpoint */
-	tmp = left;
-	half = size / 2;
-	for (i = 0; i < half-1; i++)
-		tmp = tmp->ahnext;
-
-	/* split the list into two parts */
-	right = tmp->ahnext;
-	tmp->ahnext = NULL;
-
-	/* sort the left and right parts of the list */
-	left  = merge_sort(left,       half);
-	right = merge_sort(right, size-half);
-
-	/* merge the two sorted parts of the list */
-	return merge(left, right);
-}
-
-
-/*
- * assoc_from_list -- Populate an array with the contents of a list of NODEs, 
- * using increasing integers as the key.
- */
-
-static void
-assoc_from_list(NODE *symbol, NODE *list)
-{
-	NODE *next;
-	unsigned long i = 0;
-	unsigned long hash1;
-	char buf[100];
-
-	for (; list != NULL; list = next) {
-		size_t code;
-
-		next = list->ahnext;
-
-		/* make an int out of i++ */
-		i++;
-		sprintf(buf, "%lu", i);
-		assert(list->ahname_str == NULL);
-		assert(list->ahname_ref == 1);
-		emalloc(list->ahname_str, char *, strlen(buf) + 2, "assoc_from_list");
-		list->ahname_len = strlen(buf);
-		strcpy(list->ahname_str, buf);
-
-		/* find the bucket where it belongs */
-		hash1 = hash(list->ahname_str, list->ahname_len,
-				symbol->array_size, & code);
-		list->ahcode = code;
-
-		/* link the node into the chain at that bucket */
-		list->ahnext = symbol->var_array[hash1];
-		symbol->var_array[hash1] = list;
-	}
-}
-
-/*
- * assoc_sort_inplace --- sort all the values in symbol[], replacing
- * the sorted values back into symbol[], indexed by integers starting with 1.
- */
-
-typedef enum asort_how { VALUE, INDEX } ASORT_TYPE;
-
-static NODE *
-assoc_sort_inplace(NODE *symbol, ASORT_TYPE how)
-{
-	unsigned long i, num;
-	NODE *bucket, *next, *list;
-
-	if (symbol->var_array == NULL
-	    || symbol->array_size <= 0
-	    || symbol->table_size <= 0)
-		return make_number((AWKNUM) 0);
-
-	/* build a linked list out of all the entries in the table */
-	if (how == VALUE) {
-		list = NULL;
-		num = 0;
-		for (i = 0; i < symbol->array_size; i++) {
-			for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
-				if (bucket->ahvalue->type == Node_var_array)
-					fatal(_("attempt to use array in a scalar context"));
-				next = bucket->ahnext;
-				if (bucket->ahname_ref == 1) {
-					efree(bucket->ahname_str);
-					bucket->ahname_str = NULL;
-					bucket->ahname_len = 0;
-				} else {
-					NODE *r;
-
-					getnode(r);
-					*r = *bucket;
-					ahash_unref(bucket);
-					bucket = r;
-					bucket->flags |= MALLOC;
-					bucket->ahname_ref = 1;
-					bucket->ahname_str = NULL;
-					bucket->ahname_len = 0;
-				}
-				bucket->ahnext = list;
-				list = bucket;
-				num++;
-			}
-			symbol->var_array[i] = NULL;
-		}
-	} else {	/* how == INDEX */
-		list = NULL;
-		num = 0;
-		for (i = 0; i < symbol->array_size; i++) {
-			for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
-				next = bucket->ahnext;
-
-				/* toss old value */
-				if (bucket->ahvalue->type == Node_var_array) {
-					NODE *r = bucket->ahvalue;
-					assoc_clear(r);
-					efree(r->vname);
-					freenode(r);
-				} else
-					unref(bucket->ahvalue);
-
-				/* move index into value */
-				if (bucket->ahname_ref == 1) {
-					bucket->ahvalue = make_str_node(bucket->ahname_str,
-								bucket->ahname_len, ALREADY_MALLOCED);
-					bucket->ahname_str = NULL;
-					bucket->ahname_len = 0;
-				} else {
-					NODE *r;
-
-					bucket->ahvalue = make_string(bucket->ahname_str, bucket->ahname_len);
-					getnode(r);
-					*r = *bucket;
-					ahash_unref(bucket);
-					bucket = r;
-					bucket->flags |= MALLOC;
-					bucket->ahname_ref = 1;
-					bucket->ahname_str = NULL;
-					bucket->ahname_len = 0;
-				}
-
-				bucket->ahnext = list;
-				list = bucket;
-				num++;
-			}
-			symbol->var_array[i] = NULL;
-		}
-	}
-
-	/*
-	 * Sort the linked list of NODEs.
-	 * (The especially nice thing about using a merge sort here is that
-	 * we require absolutely no additional storage. This is handy if the
-	 * array has grown to be very large.)
-	 */
-	list = merge_sort(list, num);
-
-	/*
-	 * now repopulate the original array, using increasing
-	 * integers as the key
-	 */
-	assoc_from_list(symbol, list);
-
-	return make_number((AWKNUM) num);
+	newsymb->flags = symbol->flags;	/* ARRAYMAXED */
+	return newsymb;
 }
 
 /* asort_actual --- do the actual work to sort the input array */
 
 static NODE *
-asort_actual(int nargs, ASORT_TYPE how)
+asort_actual(int nargs, SORT_CTXT ctxt)
 {
-	NODE *dest = NULL;
-	NODE *array;
-
-	if (nargs == 2) {  /* 2nd optional arg */
+	NODE *array, *dest = NULL, *result;
+	NODE *r, *subs, *sort_str;
+	NODE **list, **ptr;
+	static char buf[102];
+	unsigned long num_elems, i;
+
+	if (nargs == 3)  /* 3rd optional arg */
+		sort_str = POP_STRING();
+	else
+		sort_str = Nnull_string;	/* "" => default sorting */
+ 
+	if (nargs >= 2) {  /* 2nd optional arg */
 		dest = POP_PARAM();
 		if (dest->type != Node_var_array) {
-			fatal(how == VALUE ?
+			fatal(ctxt == ASORT ?
 				_("asort: second argument not an array") :
 				_("asorti: second argument not an array"));
 		}
@@ -1264,18 +1049,89 @@ asort_actual(int nargs, ASORT_TYPE how)
 
 	array = POP_PARAM();
 	if (array->type != Node_var_array) {
-		fatal(how == VALUE ?
+		fatal(ctxt == ASORT ?
 			_("asort: first argument not an array") :
 			_("asorti: first argument not an array"));
 	}
 
+	num_elems = array->table_size;
+	if (num_elems == 0 || array->var_array == NULL) {	/* source array is empty */
+		if (dest != NULL && dest != array)
+			assoc_clear(dest);
+		return make_number((AWKNUM) 0);
+	}
+
+	list = assoc_list(array, sort_str, ctxt);
+	DEREF(sort_str);
+
+	/* Must not assoc_clear() the source array before constructing
+	 * the output array. assoc_list() does not duplicate array values
+	 * which are needed for asort().
+	 */
+
 	if (dest != NULL && dest != array) {
 		assoc_clear(dest);
-		dup_table(array, dest);
-		array = dest;
+		result = dest;
+	} else {
+		/* use 'result' as a temporary destination array */
+		getnode(result);
+		memset(result, '\0', sizeof(NODE));
+		result->type = Node_var_array;
+		result->vname = array->vname;
 	}
 
-	return assoc_sort_inplace(array, how);
+	subs = make_str_node(buf, 100, ALREADY_MALLOCED);   /* fake it */
+	subs->flags &= ~MALLOC;		/* safety */
+	for (i = 1, ptr = list; i <= num_elems; i++) {
+		sprintf(buf, "%lu", i);
+		subs->stlen = strlen(buf);
+		r = *ptr++;
+		if (ctxt == ASORTI) {
+			/* We want the indices of the source array as values
+			 * of the 'result' array.
+			 */
+
+			*assoc_lookup(result, subs, FALSE) =
+						make_string(r->ahname_str, r->ahname_len);
+		} else {
+			NODE *val;
+
+			/* We want the values of the source array. */
+
+			val = r->ahvalue;
+			if (val->type == Node_val)
+				*assoc_lookup(result, subs, FALSE) = dupnode(val);
+			else {
+				const char *arr_name = make_aname(result, subs);
+				NODE *arr;
+
+				/* there isn't any reference counting for subarrays, so
+				 * recursively copy subarrays using dup_table().
+				 */
+				getnode(arr);
+				arr->type = Node_var_array;
+				arr->var_array = NULL;
+				arr->vname = estrdup(arr_name, strlen(arr_name));
+				*assoc_lookup(result, subs, FALSE) = dup_table(val, arr);
+			}
+		}
+
+		ahash_unref(r);
+ 	}
+
+	freenode(subs);	/* stptr(buf) not malloc-ed */
+	efree(list);
+
+	if (result != dest) {
+		/* dest == NULL or dest == array */
+		assoc_clear(array);
+		*array = *result;	/* copy result into array */
+		freenode(result);
+	} /* else
+		result == dest
+		dest != NULL and dest != array */
+
+	return make_number((AWKNUM) num_elems);
 }
 
 /* do_asort --- sort array by value */
@@ -1283,7 +1139,7 @@ asort_actual(int nargs, ASORT_TYPE how)
 NODE *
 do_asort(int nargs)
 {
-	return asort_actual(nargs, VALUE);
+	return asort_actual(nargs, ASORT);
 }
 
 /* do_asorti --- sort array by index */
@@ -1291,127 +1147,83 @@ do_asort(int nargs)
 NODE *
 do_asorti(int nargs)
 {
-	return asort_actual(nargs, INDEX);
-}
-
-
-/* comp_func --- array index comparison function for qsort, used in debug.c */
-
-int
-comp_func(const void *p1, const void *p2)
-{
-	return sort_up_index_string(p1, p2);
+	return asort_actual(nargs, ASORTI);
 }
 
-/*
- * sort_cmp_nodes --- compare two nodes.  Unlike cmp_nodes(), we don't
- * mix numeric and string comparisons.  Numbers compare as less than
- * strings, which in turn compare as less than sub-arrays.  All
- * sub-arrays compare equal to each other, regardless of their contents.
+/* cmp_string --- compare two strings; logic similar to cmp_nodes() in eval.c
+ *	except the extra case-sensitive comparison when the case-insensitive result
+ *	is a match.
  */
 
 static int
-sort_cmp_nodes(NODE *n1, NODE *n2)
+cmp_string(const NODE *n1, const NODE *n2)
 {
+	char *s1, *s2;
+	size_t len1, len2;
 	int ret;
-
-	if (n1->type == Node_var_array) {
-		/* return 0 if n2 is a sub-array too, else return 1 */
-		ret = (n2->type != Node_var_array);
-	} else if (n2->type == Node_var_array) {
-		ret = -1;	/* n1 (scalar) < n2 (sub-array) */
+	size_t lmin;
+
+	assert(n1->type == n2->type);
+	if (n1->type == Node_ahash) {
+		s1 = n1->ahname_str;
+		len1 = n1->ahname_len;
+		s2 =  n2->ahname_str;
+		len2 = n2->ahname_len;
 	} else {
-		/*
-		 * Both scalars; we can't settle for a regular
-		 * MAYBE_NUM comparison because that can cause
-		 * problems when strings fall between numbers
-		 * whose value makes them be ordered differently
-		 * when handled as strings than as numbers.
-		 * For example, { 10, 5, "3D" } yields a cycle:
-		 * 5 < 10, "10" < "3D", "3D" < "5", so would
-		 * produce different sort results depending upon
-		 * the order in which comparisons between pairs
-		 * of values happen to be performed.  We force
-		 * numbers to be less than strings in order to
-		 * avoid that: 5 < 10, 10 < "3D", 5 < "3D".
-		 */
-		/* first resolve any undecided types */
-		if (n1->flags & MAYBE_NUM)
-			(void) force_number(n1);
-		if (n2->flags & MAYBE_NUM)
-			(void) force_number(n2);
-		/*
-		 * If both types are the same, use cmp_nodes();
-		 * otherwise, force the numeric value to be less
-		 * than the string one.
-		 */
-		if ((n1->flags & NUMBER) == (n2->flags & NUMBER))
-			ret = cmp_nodes(n1, n2);
-		else
-			ret = (n1->flags & NUMBER) ? -1 : 1;
+		s1 = n1->stptr;
+		len1 = n1->stlen;
+		s2 =  n2->stptr;
+		len2 = n2->stlen;
 	}
-	return ret;
-}
 
-/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */
+	if (len1 == 0)
+		return len2 == 0 ? 0 : -1;
+	if (len2 == 0)
+		return 1;
 
-static int
-sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2)
-{
-	size_t l;
-	int ret;
+	/* len1 > 0 && len2 > 0 */
+	lmin = len1 < len2 ? len1 : len2;
+
+	if (IGNORECASE) {
+		const unsigned char *cp1 = (const unsigned char *) s1;
+		const unsigned char *cp2 = (const unsigned char *) s2;
 
-	l = (l1 < l2) ? l1 : l2;
 #ifdef MBS_SUPPORT
-	if (gawk_mb_cur_max > 1) {
-		ret = strncasecmpmbs((const unsigned char *) s1,
-				     (const unsigned char *) s2, l);
-	} else
+		if (gawk_mb_cur_max > 1) {
+			ret = strncasecmpmbs((const unsigned char *) cp1,
+					     (const unsigned char *) cp2, lmin);
+		} else
 #endif
-	for (ret = 0; l-- > 0 && ret == 0; s1++, s2++)
-		ret = casetable[*(unsigned char *) s1]
-		      - casetable[*(unsigned char *) s2];
-	if (ret == 0 && l1 != l2)
-		ret = (l1 < l2) ? -1 : 1;
-	return ret;
+		for (ret = 0; lmin-- > 0 && ret == 0; cp1++, cp2++)
+			ret = casetable[*cp1] - casetable[*cp2];
+		if (ret != 0)
+			return ret;
+		/* if case insensitive result is "they're the same",
+		 * use case sensitive comparison to force distinct order
+		 */
+	}
+
+	ret = memcmp(s1, s2, lmin);
+	if (ret != 0 || len1 == len2)
+		return ret;
+	return (len1 < len2) ? -1 : 1;
 }
 
-/*
- * sort_up_index_string --- qsort comparison function; ascending index strings;
- * index strings are distinct within an array, so no comparisons will yield
- * equal and warrant disambiguation
- */
+
+/* sort_up_index_string --- qsort comparison function; ascending index strings. */
 
 static int
 sort_up_index_string(const void *p1, const void *p2)
 {
 	const NODE *t1, *t2;
-	const char *str1, *str2;
-	size_t len1, len2;
-	int ret;
 
-	/* Array indexes are strings and distinct, never equal */
+	/* Array indexes are strings */
 	t1 = *((const NODE *const *) p1);
 	t2 = *((const NODE *const *) p2);
-
-	str1 = t1->ahname_str;
-	len1 = t1->ahname_len;
-	str2 = t2->ahname_str;
-	len2 = t2->ahname_len;
-
-	ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2);
-	/*
-	 * if they compared equal but their lengths differ, the
-	 * shorter one is a prefix of the longer and compares as less
-	 */
-	if (ret == 0 && len1 != len2)
-		ret = (len1 < len2) ? -1 : 1;
-
-	/* indices are unique within an array, so result should never be 0 */
-	assert(ret != 0);
-	return ret;
+	return cmp_string(t1, t2);
 }
 
+
 /* sort_down_index_string --- descending index strings */
 
 static int
@@ -1429,69 +1241,30 @@ sort_down_index_string(const void *p1, const void *p2)
 	return -sort_up_index_string(p1, p2);
 }
 
-/*
- * sort_up_index_ignrcase --- ascending index string, case insensitive;
- * case insensitive comparison can cause distinct index strings to compare
- * equal, so disambiguation in necessary
- */
-
-static int
-sort_up_index_ignrcase(const void *p1, const void *p2)
-{
-	const NODE *t1, *t2;
-	int ret;
-
-	t1 = *((const NODE *const *) p1);
-	t2 = *((const NODE *const *) p2);
-
-	ret = sort_ignorecase(t1->ahname_str, t1->ahname_len,
-			      t2->ahname_str, t2->ahname_len);
-
-	/*
-	 * if case insensitive result is "they're the same",
-	 * use case sensitive comparison to force distinct order
-	 */
-	if (ret == 0)
-		ret = sort_up_index_string(p1, p2);
-	return ret;
-}
-
-/* sort_down_index_ignrcase --- descending index strings, case insensitive */
-
-static int
-sort_down_index_ignrcase(const void *p1, const void *p2)
-{
-	return -sort_up_index_ignrcase(p1, p2);
-}
 
-/*
- * sort_up_index_number --- qsort comparison function; ascending index numbers;
- * the args have been built for us by sort_maybe_numeric_index() and point
- * to custom structs rather than to gawk nodes
- */
+/* sort_up_index_number --- qsort comparison function; ascending index numbers. */
 
 static int
 sort_up_index_number(const void *p1, const void *p2)
 {
-	const struct sort_num *n1, *n2;
+	const NODE *n1, *n2;
 	int ret;
 
-	n1 = *((const struct sort_num *const *) p1);
-	n2 = *((const struct sort_num *const *) p2);
+	n1 = *((const NODE *const *) p1);
+	n2 = *((const NODE *const *) p2);
 
-	if (n1->sort_numbr < n2->sort_numbr)
+	if (n1->ahname_num < n2->ahname_num)
 		ret = -1;
 	else
-		ret = (n1->sort_numbr > n2->sort_numbr);
+		ret = (n1->ahname_num > n2->ahname_num);
 
 	/* break a tie with the index string itself */
 	if (ret == 0)
-		return sort_up_index_string((const void *) &n1->sort_index,
-					    (const void *) &n2->sort_index);
+		return cmp_string(n1, n2);
 	return ret;
 }
 
-/* sort_down_index_number --- descending index numbers */
+/* sort_down_index_number --- qsort comparison function; descending index numbers */
 
 static int
 sort_down_index_number(const void *p1, const void *p2)
@@ -1499,144 +1272,131 @@ sort_down_index_number(const void *p1, const void *p2)
 	return -sort_up_index_number(p1, p2);
 }
 
-/* sort_up_value --- qsort comparison function; ascending values */
+
+/* sort_up_value_string --- qsort comparison function; ascending value string */
 
 static int
-sort_up_value(const void *p1, const void *p2)
+sort_up_value_string(const void *p1, const void *p2)
 {
-	const NODE *idx1, *idx2;
-	NODE *val1, *val2;
-	int ret;
+	const NODE *t1, *t2;
+	NODE *n1, *n2;
 
 	/* we're passed a pair of index (array subscript) nodes */
-	idx1 = *(const NODE *const *) p1;
-	idx2 = *(const NODE *const *) p2;
-	if (idx1->type != Node_ahash || idx2->type != Node_ahash)
-		cant_happen();
+	t1 = *(const NODE *const *) p1;
+	t2 = *(const NODE *const *) p2;
 
 	/* and we want to compare the element values they refer to */
-	val1 = idx1->hvalue;
-	val2 = idx2->hvalue;
+	n1 = t1->ahvalue;
+	n2 = t2->ahvalue;
 
-	ret = sort_cmp_nodes(val1, val2);
+	if (n1->type == Node_var_array && n2->type == Node_val)
+		return 1;	/* n1 is more */
+	if (n1->type == Node_val && n2->type == Node_var_array)
+		return -1;	/* n1 is less */
 
-	/* disambiguate ties to force a deterministic order */
-	if (ret == 0)
-		ret = sort_up_index_string(p1, p2);
-	return ret;
+ 	if (n1->type == Node_var_array && n2->type == Node_var_array) {
+		/* sub-array names contain respective indices, effectively resulting
+		 * in an index-based (in parent array) ordering.
+		 */
+		return strcmp(n1->vname, n2->vname);
+	}
+
+	/* n1 and n2 both have string values; See sort_force_value_string(). */
+	return cmp_string(n1, n2);
 }
 
-/* sort_down_value --- descending value maybe-nums */
+
+/* sort_down_value_string --- descending value string */
 
 static int
-sort_down_value(const void *p1, const void *p2)
+sort_down_value_string(const void *p1, const void *p2)
 {
-	return -sort_up_value(p1, p2);
+	return -sort_up_value_string(p1, p2);
 }
 
-/* sort_maybe_numeric_index --- special handling for sorting indices as numbers */
+/* sort_up_value_number --- qsort comparison function; ascending value number */
 
-void
-sort_maybe_numeric_index(qsort_compfunc func, NODE **list,
-			 size_t num_elems, int setup)
+static int
+sort_up_value_number(const void *p1, const void *p2)
 {
-	static const NODE empty_node;
+	const NODE *t1, *t2;
+	NODE *n1, *n2;
+	int ret;
 
-	size_t i;
-	NODE temp_node;
-	struct sort_num *fake_node;
+	/* we're passed a pair of index (array subscript) nodes */
+	t1 = *(const NODE *const *) p1;
+	t2 = *(const NODE *const *) p2;
 
-	/* if we're not sorting indices by numeric value, do nothing */
-	if (func != sort_up_index_number && func != sort_down_index_number)
-		return;
+	/* and we want to compare the element values they refer to */
+	n1 = t1->ahvalue;
+	n2 = t2->ahvalue;
+
+	if (n1->type == Node_var_array && n2->type == Node_val)
+		return 1;	/* n1 is more */
+	if (n1->type == Node_val && n2->type == Node_var_array)
+		return -1;	/* n1 is less */
+	if (n1->type == Node_var_array && n2->type == Node_var_array) {
+		/* sub-array names contain respective indices, effectively resulting
+		 * in an index-based (in parent array) ordering.
+		 */
 
-	/*
-	 * Insert a new struct in front of each index, so there's somewhere
-	 * to store its numeric value after that's calculated once, avoiding
-	 * having the qsort callback recalculate it for every comparison.
-	 * Our caller will call us back to destroy this intermediate array
-	 * as soon as qsort() finishes; the rest of gawk won't see it.
-	 * We could use--or perhaps mis-use--gawk nodes here, but they'd tie
-	 * up quite a bit more memory during the sort.
-	 */
-	if (setup) {
-		temp_node = empty_node;	/* init union fields to 0,NULL */
-		for (i = 0; i < num_elems; ++i) {
-			emalloc(fake_node, struct sort_num *,
-				sizeof (struct sort_num), "sort numeric index");
-			/* populate enough fields to satisfy force_number();
-			   we don't need to maintain reference counts */
-			temp_node.type = Node_val;
-			temp_node.stptr = list[i]->ahname_str;
-			temp_node.stlen = list[i]->ahname_len;
-			temp_node.flags = PERM | STRING | MAYBE_NUM;
-			/* insert the extra data in front of our input node */
-			fake_node->sort_numbr = force_number(&temp_node);
-			fake_node->sort_index = list[i];
-			list[i] = (NODE *) fake_node;
-		}
-	} else {	/* post sort cleanup */
-		for (i = 0; i < num_elems; ++i) {
-			/* undo the setup manipulations */
-			fake_node = (struct sort_num *) list[i];
-			list[i] = fake_node->sort_index;
-			efree((void *) fake_node);
-		}
+		return strcmp(n1->vname, n2->vname);
 	}
-}
 
-/* sort_match --- compare leading part of a string against a sort option */
+	/* n1 and n2 both Node_val, and force_number'ed */
+	if (n1->numbr < n2->numbr)
+		ret = -1;
+	else
+		ret = (n1->numbr > n2->numbr);
 
-static size_t
-sort_match(const char *str, size_t slen, const char *target)
-{
-	const char *endofword;
+	if (ret != 0)
+		return ret;
 
-	/*
-	 * STR comes from the user and is SLEN characters long;
-	 * if it has spaces then we care about the subset up until
-	 * its first space rather than the full length.
-	 * We check whether STR is the same as--or proper prefix
-	 * of--TARGET.
-	 * Accept "by-foo" or "as-foo" as a synonym match for "foo".
+	/* break a tie using string comparison. First, make sure both
+	 * n1 and n2 have string values.
 	 */
-	endofword = strchr(str, ' ');
-	if (endofword != NULL)
-		slen = (size_t) (endofword - str);
-	/* try for exact match */
-	if (slen > 0 && strncasecmp(str, target, slen) == 0)
-		return slen;
-	/* ingore "by-" or "as-" prefix and try again */
-	if (slen > 3
-	    && (strncasecmp(str, "by-", 3) == 0
-		|| strncasecmp(str, "as-", 3) == 0)
-	    && strncasecmp(str + 3, target, slen - 3) == 0)
-		return slen;
-	/* no match */
-	return 0;
+	(void) force_string(n1);
+	(void) force_string(n2);
+	return cmp_string(n1, n2);
+}
+
+
+/* sort_down_value_string --- descending value number */
+
+static int
+sort_down_value_number(const void *p1, const void *p2)
+{
+	return -sort_up_value_number(p1, p2);
 }
 
+
 /*
- * sort_selection --- parse user-specified sort ordering
- * ("ascending index" or "value number" and so forth);
- * returns a qsort comparison function
+ * sort_selection --- parse user-specified sort specification;
+ * returns an index into sort_funcs table located in assoc_list().
  */
 
-static qsort_compfunc
-sort_selection(NODE *r, const char *warn_arg, int default_by_value)
+static int
+sort_selection(NODE *sort_str, SORT_CTXT sort_ctxt)
 {
+	/* first 4 bits used to calculate index into sort_funcs table in assoc_list(),
+	 * next 4 bits for grouping individual components. Note that "Unsorted"
+	 * belongs to all the groups.
+	 */
+
 	enum sort_bits {
-		unrecognized =    0,
-		Unsorted     = 0x01,
-		Ascending    = 0x02,
-		Descending   = 0x04,
-		by_Index     = 0x08,
-		by_Value     = 0x10,
-		as_String    = 0x20,
-		as_Number    = 0x40,
-		as_Inserted  = 0x80,
-		allbitsused  = 0xff
+		Unrecognized = 0xFF,	/* not part of a sort phrase */
+		Unsorted     = 0xF8,
+		Ascending    = 0x40,
+		Descending   = 0x44,
+		by_Index     = 0x20,
+		by_Value     = 0x22,
+		as_String    = 0x10,
+		as_Number    = 0x11
 	};
+
+#define INDEX_MASK	0x0F
+#define GROUP_MASK	0xF0
+
 	/*
 	 * The documented values are singular, but it's trivial to accept
 	 * "index numbers" and "descending values" since we're using a
@@ -1644,214 +1404,231 @@ sort_selection(NODE *r, const char *warn_arg, int default_by_value)
 	 */
 	static const struct sort_keys {
 		const char *const keyword;
+		const size_t keyword_len;
 		enum  sort_bits keybit;
 	} sorts[] = {
-		{ "unsorted",   Unsorted    },	/* not part of a three-part key */
-		{ "ascending",  Ascending   },	/* ascending vs descending */
-		{ "descending", Descending  },
-		{ "indexes",    by_Index    },	/* by_Index vs by_Number */
-		{ "indices",    by_Index    },	/* synonym for plural match */
-		{ "values",     by_Value    },
-		{ "strings",    as_String   },	/* as_String vs as_Number */
-		{ "numbers",    as_Number   },
-		{ "numeric",    as_Number   },	/* synonym; singular only */
-		{ "inserted",   as_Inserted },	/* not part of a three-part key */
-		{ "insertion",  as_Inserted },	/* another synonym */
+		{ "unsorted",   8, Unsorted },
+		{ "ascending",  9, Ascending },	/* ascending vs descending */
+		{ "descending", 10, Descending  },
+		{ "indexes", 7, by_Index  },	/* by_Index vs by_Value */
+		{ "indices", 7, by_Index  },	/* synonym for plural match */
+		{ "values", 6,  by_Value  },
+		{ "strings", 7, as_String },	/* as_String vs as_Number */
+		{ "numbers", 7, as_Number },
+		{ "numeric", 7, as_Number },	/* synonym; singular only */
 	};
+	static int num_keys = sizeof(sorts) / sizeof(sorts[0]);
+	int num_words, i;
+	char *word, *s;
+	size_t word_len;
+	enum sort_bits allparts, bval;
+
+	if (sort_str == NULL) {
+		assert(sort_ctxt == SORTED_IN);
+		return (Unsorted & INDEX_MASK);		/* no sorting */
+	}
 
-	static short warned_unrecognized = FALSE;
-	static enum sort_bits prev_invalid = unrecognized;
+	(void) force_string(sort_str);
+	sort_str->stptr[sort_str->stlen] = '\0';	/* safety */
 
-	char *s;
-	size_t l, matchlen;
-	unsigned i;
-	const char *errfmt;
-	enum sort_bits allparts;
-	qsort_compfunc sort_func, default_sort_func;
-
-	/* deduce our caller from the args provided */
-	if (warn_arg != NULL)		/* for-in statement */
-		default_sort_func = (qsort_compfunc) 0;	/* unsorted */
-	else if (default_by_value)	/* asort() */
-		default_sort_func = sort_up_value;
-	else				/* asorti() */
-		default_sort_func = ! IGNORECASE ? sort_up_index_string
-						: sort_up_index_ignrcase;
-
-	r = force_string(r);
-	s = r->stptr;
-	l = r->stlen;
-
-	/* treat empty sort-order string as request for default */
-	if (l == 0)
-		return default_sort_func;
-
-	/* no ordering specification yet */
-	allparts = unrecognized;
-	/*
-	 * Scan through S until length L is exhausted, checking its
-	 * starting word against each possible ordering choice and
-	 * then skipping past that to get to the next word.  If no
-	 * choice matches, an error has been detected.  Duplicate
-	 * words are accepted; contradictory ones get noticed and
-	 * rejected after the string parsing has completed.
+	/* Initialize with the context-sensitive defaults; Note that group-bits aren't
+	 * copied, doing so will prevent the user from explicitely specifying the defaults.
 	 */
-	while (l > 0) {
-		matchlen = 0;	/* lint suppression */
-		for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) {
-			matchlen = sort_match(s, l, sorts[i].keyword);
-			if (matchlen > 0) {
-				allparts |= sorts[i].keybit;
+ 
+	if (sort_ctxt == ASORT)
+		allparts = (Ascending|by_Value|as_String) & INDEX_MASK;
+	else
+		allparts = (Ascending|by_Index|as_String) & INDEX_MASK;
+
+	num_words = 0;
+
+	for (s = sort_str->stptr; *s != '\0'; ) {
+		/* skip leading spaces */
+		while (*s == ' ')
+			s++;
+		if (*s == '\0')
+			break;
+		word = s;
+		/* find the end of the word */
+		while (*s && *s != ' ')
+			s++;
+		word_len = (size_t) (s - word);
+
+		if (++num_words > 3)	/* too many words in phrase */
+			goto un_recognized;
+
+		bval = Unrecognized;
+		for (i = 0; i < num_keys; i++) {
+			if (word_len <= sorts[i].keyword_len
+       		 	      && strncasecmp(sorts[i].keyword, word, word_len) == 0) {
+				bval = sorts[i].keybit;
 				break;
 			}
 		}
-		if (matchlen == 0) {
-			/* failed to match any possible component */
-			allparts = unrecognized;
-			break;
-		}
-		/* move past the part just handled */
-		s += matchlen, l -= matchlen;
-		/* (l > 0) here would accept a trailing space; (l > 1) won't */
-		if (l > 1 && *s == ' ')
-			++s, --l;
-	} /* while (l > 0) */
+		  		
+		if (bval == Unrecognized         /* invalid word in phrase */
+			||	(sort_ctxt == ASORT
+					&& (bval == by_Index || bval == Unsorted)
+				)                        /* "index" used in phrase for asort etc. */
+			||	(sort_ctxt == ASORTI
+					&& (bval == by_Value || bval == Unsorted)
+				)                        /* "value" used in phrase for asorti etc. */
+			||	((allparts & bval) & GROUP_MASK
+				)                        /* invalid grouping of words e.g. "str num" */
+		)
+			goto un_recognized;
+
+		allparts |= bval;
+	}
 
-	/*
-	 * If we have one or two parts of a three part key, fill in
-	 * whichever parts are missing with default values.  For key
-	 * type the default is specified by our caller, for direction
-	 * it is Ascending, and for comparison mode it is as_String
-	 * but only when sorting by_Index.  When sorting by_Value,
-	 * comparison mode is out of the user's control and we don't
-	 * need or want to fill in either as_String or as_Number.
-	 */
-	if ((allparts & (Ascending|Descending|as_String|as_Number)) != 0
-	    && (allparts & (Unsorted|by_Index|by_Value|as_Inserted)) == 0)
-		allparts |= default_by_value ? by_Value : by_Index;
-	if ((allparts & (by_Index|by_Value|as_String|as_Number)) != 0
-	    && (allparts & (Unsorted|Ascending|Descending|as_Inserted)) == 0)
-		allparts |= Ascending;
-	/* by_Value is handled differently from by_Index */
-	if ((allparts & (Ascending|Descending|by_Index)) != 0
-	    && (allparts & (Unsorted|by_Value|as_String|as_Number|as_Inserted)) == 0)
-		allparts |= as_String;
+	/* num_words <= 3 */
+	return (allparts & INDEX_MASK);
 
-	/*
-	 * ALLPARTS is a bit mask of all the user's specified ordering
-	 * choices; partial specifications have been extended into full
-	 * three part key.  Valid combinations yield a qsort(3) comparison
-	 * routine result.  Invalid ones are rejected and yield default
-	 * sort, with an error message for asort()/asorti(), a warning
-	 * for for-in statement if this is a different invalid combination
-	 * than the most recently warned about one, or a one-time warning
-	 * if this is an unrecognized ordering specification and we're
-	 * running in lint checking mode.
-	 */
-	sort_func = default_sort_func;
-	errfmt = NULL;
-	switch (allparts) {
-	case Unsorted:
-		sort_func = (qsort_compfunc) 0;
-		break;
-	case Ascending|by_Index|as_String:
-		sort_func = ! IGNORECASE ? sort_up_index_string
-					: sort_up_index_ignrcase;
-		break;
-	case Descending|by_Index|as_String:
-		sort_func = ! IGNORECASE ? sort_down_index_string
-					: sort_down_index_ignrcase;
-		break;
-	case Ascending|by_Index|as_Number:
-		sort_func = sort_up_index_number;
-		break;
-	case Descending|by_Index|as_Number:
-		sort_func = sort_down_index_number;
-		break;
-	case Ascending|by_Value:
-		sort_func = sort_up_value;
-		break;
-	case Descending|by_Value:
-		sort_func = sort_down_value;
-		break;
-	/* for value sorts, distinction between string and number is not
-	   allowed; they're always ordered by basic awk MAYBE_NUM comparison */
-	case Ascending|by_Value|as_String:
-	case Descending|by_Value|as_String:
-		if (errfmt == NULL)
-			errfmt = _("%s: sorting by value can't be forced to use string comparison");
-		/* FALL THROUGH */
-	case Ascending|by_Value|as_Number:
-	case Descending|by_Value|as_Number:
-		if (errfmt == NULL)
-			errfmt = _("%s: sorting by value can't be forced to use numeric comparison");
-		/* FALL THROUGH */
-	/* gawk doesn't maintain the data necessary to sort by
-	   order of insertion */
-	case as_Inserted:
-		if (errfmt == NULL)
-			errfmt = _("%s: sort ordering \"as-inserted\" is not implemented");
-		/* FALL THROUGH */
+un_recognized:
+	switch (sort_ctxt) {
+	case ASORT:
+		fatal(_("asort: invalid sort specification `%s'"), sort_str->stptr);
+	case ASORTI:
+		fatal(_("asorti: invalid sort specification `%s'"), sort_str->stptr);
+
+	case SORTED_IN:
+		/* fall through */
 	default:
-		/* sort_func still has default value */
-		if (errfmt == NULL)
-			errfmt = (allparts == unrecognized)
-				? _("%s: sort specification is not recognized")
-				: _("%s: sort specification is invalid");
-		if (warn_arg == NULL) {
-			/* asort()/asorti() bad sort specification argument */
-			error(errfmt, default_by_value ? "asort" : "asorti");
-		} else if (allparts != unrecognized) {
-			/* invalid combination of valid ordering components */
-			if (allparts != prev_invalid)
-				warning(errfmt, warn_arg);
-			prev_invalid = allparts;
-		} else if (do_lint) {
-			/* unrecognized ordering specification string */              
-			if (! warned_unrecognized)
-				lintwarn(errfmt, warn_arg);
-			warned_unrecognized = TRUE;
+		if (do_lint) {
+			static NODE *warned_str = NULL;
+
+			/* warning for each UNIQUE unrecognized sort_str specification */              
+			if (warned_str == NULL || ! STREQ(warned_str->stptr, sort_str->stptr)) {
+				lintwarn(_("PROCINFO[\"sorted_in\"]: invalid sort specification `%s'"),
+						sort_str->stptr);
+				unref(warned_str);	/* unref(NULL) is OK */ 
+				warned_str = dupnode(sort_str);
+			}
 		}
 		break;
 	}
-	return sort_func;
+	return (Unsorted & INDEX_MASK);
+
+#undef INDEX_MASK
+#undef GROUP_MASK
 }
 
-/*
- * sorted_in --- fetch value of PROCINFO["sorted_in"] and use sort_selection()
- * to parse it; controls whether and how traversal of ``for (index in array)''
- * should be sorted; returns a qsort comparison function
- */
+/* sort_force_index_number -- pre-process list items for sorting indices as numbers */
 
-qsort_compfunc
-sorted_in(void)		/* called from r_interpret(eval.c) */
+static void
+sort_force_index_number(NODE **list, size_t num_elems)
 {
-	static NODE *sorted_str = NULL;
-	static short warned_extension = FALSE;
-
+	size_t i;
 	NODE *r;
+	static NODE temp_node;
+
+	for (i = 0; i < num_elems; i++) {
+		r = list[i];
+
+		/* Kludge: the use of NUMCUR flag for a Node_ahash is only to detect
+		 * multiple conversion attempts. Also, the flag value of a Node_ahash
+		 * isn't used anywhere else in the gawk source. 
+		 * Maybe a new flag other than NUMCUR should be used ?
+		 */
+
+		if ((r->flags & NUMCUR) != 0)	/* once in a lifetime is more than enough */
+			continue;
+		temp_node.type = Node_val;
+		temp_node.stptr = r->ahname_str;
+		temp_node.stlen = r->ahname_len;
+		temp_node.flags = 0;	/* only interested in the return value of r_force_number */
+		r->ahname_num = r_force_number(&temp_node);
+		r->flags |= NUMCUR;
+	}
+}
 
-	/* if there's no PROCINFO[], there's no ["sorted_in"], so no sorting */
-	if (PROCINFO_node == NULL)
-		return (qsort_compfunc) 0;
+/* sort_force_value_number -- pre-process list items for sorting values as numbers */
 
-	if (sorted_str == NULL)		/* do this once */
-		sorted_str = make_string("sorted_in", 9);
+static void
+sort_force_value_number(NODE **list, size_t num_elems)
+{
+	size_t i;
+	NODE *r, *val;
+
+	for (i = 0; i < num_elems; i++) {
+		r = list[i];
+		val = r->ahvalue;
+		if (val->type == Node_val)
+			(void) force_number(val);
+	}
+}
 
-	r = (NODE *) NULL;
-	if (in_array(PROCINFO_node, sorted_str))
-		r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE);
-	/* if there's no PROCINFO["sorted_in"], there's no sorting */
-	if (r == NULL)
-		return (qsort_compfunc) 0;
+/* sort_force_value_string -- pre-process list items for sorting values as strings */
+
+static void
+sort_force_value_string(NODE **list, size_t num_elems)
+{
+	size_t i;
+	NODE *r, *val;
 
-	/* we're going to make use of PROCINFO["sorted_in"] */
-	if (do_lint && ! warned_extension) {
-		warned_extension = TRUE;
-		lintwarn(_("sorted array traversal is a gawk extension"));
+	for (i = 0; i < num_elems; i++) {
+		r = list[i];
+		val = r->ahvalue;
+		if (val->type == Node_val)
+			r->ahvalue = force_string(val);
 	}
+}
+
+
+/* assoc_list -- construct, and optionally sort, a list of array elements */  
+
+NODE **
+assoc_list(NODE *array, NODE *sort_str, SORT_CTXT sort_ctxt)
+{
+	typedef void (*qsort_prefunc)(NODE **, size_t);
+	typedef int (*qsort_compfunc)(const void *, const void *);
+
+	static const struct qsort_funcs {
+		qsort_compfunc comp_func;
+		qsort_prefunc pre_func;		/* pre-processing of list items */
+	} sort_funcs[] = {
+		{ sort_up_index_string,		0                       },	/* ascending index string */
+		{ sort_up_index_number,		sort_force_index_number },	/* ascending index number */
+		{ sort_up_value_string,		sort_force_value_string },	/* ascending value string */
+		{ sort_up_value_number,		sort_force_value_number	},	/* ascending value number */
+		{ sort_down_index_string,	0                       },	/* descending index string */ 
+		{ sort_down_index_number,	sort_force_index_number },	/* descending index number */	
+		{ sort_down_value_string,	sort_force_value_string },	/* descending value string */ 
+		{ sort_down_value_number,	sort_force_value_number },	/* descending value number */
+		{ 0,	0 }                                            	/* unsorted */
+	};
+	int qi;
+	NODE **list;
+	NODE *r;
+	size_t num_elems, i, j;
+	qsort_compfunc cmp_func;
+	qsort_prefunc pre_func;
+
+	num_elems = array->table_size;
+	assert(num_elems > 0);
+
+	qi = sort_selection(sort_str, sort_ctxt);
+
+	/* allocate space for array; the extra space is used in for(i in a) opcode (eval.c) */
+	emalloc(list, NODE **, (num_elems + 1) * sizeof(NODE *), "assoc_list");
+
+	/* populate it */
+	for (i = j = 0; i < array->array_size; i++)
+		for (r = array->var_array[i]; r != NULL; r = r->ahnext)
+			list[j++] = ahash_dupnode(r);
+	list[num_elems] = NULL;
+
+	cmp_func = sort_funcs[qi].comp_func;
+	if (! cmp_func)	/* unsorted */	
+		return list;
+
+	/* special pre-processing of list items */
+	pre_func = sort_funcs[qi].pre_func;
+	if (pre_func)
+		pre_func(list, num_elems);
 
-	return sort_selection(r, "`PROCINFO[\"sorted_in\"]'", FALSE);
+	qsort(list, num_elems, sizeof(NODE *), cmp_func); /* shazzam! */
+	return list;
 }
author	Arnold D. Robbins <arnold@skeeve.com>	2011-04-18 10:23:54 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2011-04-18 10:23:54 +0300
commit	bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32 (patch)
tree	41cf2343944c2836150422b1bafabdaa73ff32bb /array.c
parent	98eb78c2d05870952fe25bbe37b86df2017f42fc (diff)
download	egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.tar.gz egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.tar.bz2 egawk-bcb8bd6d6671a3400c4bfe3e34eb4d5d66050f32.zip