aboutsummaryrefslogtreecommitdiffstats
path: root/array.c
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2011-03-29 20:51:30 +0200
committerArnold D. Robbins <arnold@skeeve.com>2011-03-29 20:51:30 +0200
commit2400cc5143383a881356a9f55e93b60037d851e5 (patch)
tree3ba75f8b1225bf50e48b4f6c381e7772f1ab6c5b /array.c
parent4fe569fb78dd1b25822c16c9cac515a0fc6702a4 (diff)
downloadegawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.gz
egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.bz2
egawk-2400cc5143383a881356a9f55e93b60037d851e5.zip
Revise array sorting for PROCINFO["sorted_in"].
Diffstat (limited to 'array.c')
-rw-r--r--array.c545
1 files changed, 545 insertions, 0 deletions
diff --git a/array.c b/array.c
index b821b5e4..0b03397c 100644
--- a/array.c
+++ b/array.c
@@ -54,6 +54,24 @@ static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, si
unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash;
+/* used by sort_maybe_numeric_index() and sort_up_index_number() */
+struct sort_num { AWKNUM sort_numbr; NODE *sort_index; };
+
+/* sorting support code */
+static qsort_compfunc sort_selection(NODE *, const char *, int);
+static size_t sort_match(const char *, size_t, const char *);
+static int sort_ignorecase(const char *, size_t, const char *, size_t);
+
+/* qsort callback routines */
+static int sort_up_index_string(const void *, const void *);
+static int sort_down_index_string(const void *, const void *);
+static int sort_up_index_ignrcase(const void *, const void *);
+static int sort_down_index_ignrcase(const void *, const void *);
+static int sort_up_index_number(const void *, const void *);
+static int sort_down_index_number(const void *, const void *);
+static int sort_up_value(const void *, const void *);
+static int sort_down_value(const void *, const void *);
+
/* array_init --- possibly temporary function for experimentation purposes */
void
@@ -1273,6 +1291,533 @@ do_asorti(int nargs)
return asort_actual(nargs, INDEX);
}
+
+/* comp_func --- array index comparison function for qsort, used in debug.c */
+
+int
+comp_func(const void *p1, const void *p2)
+{
+ return sort_up_index_string(p1, p2);
+}
+
+/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */
+
+static int
+sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2)
+{
+ size_t l;
+ int ret;
+
+ l = (l1 < l2) ? l1 : l2;
+#ifdef MBS_SUPPORT
+ if (gawk_mb_cur_max > 1) {
+ ret = strncasecmpmbs((const unsigned char *) s1,
+ (const unsigned char *) s2, l);
+ } else
+#endif
+ for (ret = 0; l-- > 0 && ret == 0; s1++, s2++)
+ ret = casetable[*(unsigned char *) s1]
+ - casetable[*(unsigned char *) s2];
+ if (ret == 0 && l1 != l2)
+ ret = (l1 < l2) ? -1 : 1;
+ return ret;
+}
+
+/*
+ * sort_up_index_string --- qsort comparison function; ascending index strings;
+ * index strings are distinct within an array, so no comparisons will yield
+ * equal and warrant disambiguation
+ */
+
+static int
+sort_up_index_string(const void *p1, const void *p2)
+{
+ const NODE *t1, *t2;
+ const char *str1, *str2;
+ size_t len1, len2;
+ int ret;
+
+ /* Array indexes are strings and distinct, never equal */
+ t1 = *((const NODE *const *) p1);
+ t2 = *((const NODE *const *) p2);
+
+ str1 = t1->ahname_str;
+ len1 = t1->ahname_len;
+ str2 = t2->ahname_str;
+ len2 = t2->ahname_len;
+
+ ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2);
+ /*
+ * if they compared equal but their lengths differ, the
+ * shorter one is a prefix of the longer and compares as less
+ */
+ if (ret == 0 && len1 != len2)
+ ret = (len1 < len2) ? -1 : 1;
+
+ /* indices are unique within an array, so result should never be 0 */
+ assert(ret != 0);
+ return ret;
+}
+
+/* sort_down_index_string --- descending index strings */
+
+static int
+sort_down_index_string(const void *p1, const void *p2)
+{
+ /*
+ * Negation versus transposed arguments: when all keys are
+ * distinct, as with array indices here, either method will
+ * transform an ascending sort into a descending one. But if
+ * there are equal keys--such as when IGNORECASE is honored--
+ * that get disambiguated into a determisitc order, negation
+ * will reverse those but transposed arguments would retain
+ * their relative order within the rest of the reversed sort.
+ */
+ return -sort_up_index_string(p1, p2);
+}
+
+/*
+ * sort_up_index_ignrcase --- ascending index string, case insensitive;
+ * case insensitive comparison can cause distinct index strings to compare
+ * equal, so disambiguation in necessary
+ */
+
+static int
+sort_up_index_ignrcase(const void *p1, const void *p2)
+{
+ const NODE *t1, *t2;
+ int ret;
+
+ t1 = *((const NODE *const *) p1);
+ t2 = *((const NODE *const *) p2);
+
+ ret = sort_ignorecase(t1->ahname_str, t1->ahname_len,
+ t2->ahname_str, t2->ahname_len);
+
+ /*
+ * if case insensitive result is "they're the same",
+ * use case sensitive comparison to force distinct order
+ */
+ if (ret == 0)
+ ret = sort_up_index_string(p1, p2);
+ return ret;
+}
+
+/* sort_down_index_ignrcase --- descending index strings, case insensitive */
+
+static int
+sort_down_index_ignrcase(const void *p1, const void *p2)
+{
+ return -sort_up_index_ignrcase(p1, p2);
+}
+
+/*
+ * sort_up_index_number --- qsort comparison function; ascending index numbers;
+ * the args have been built for us by sort_maybe_numeric_index() and point
+ * to custom structs rather than to gawk nodes
+ */
+
+static int
+sort_up_index_number(const void *p1, const void *p2)
+{
+ const struct sort_num *n1, *n2;
+ int ret;
+
+ n1 = *((const struct sort_num *const *) p1);
+ n2 = *((const struct sort_num *const *) p2);
+
+ if (n1->sort_numbr < n2->sort_numbr)
+ ret = -1;
+ else
+ ret = (n1->sort_numbr > n2->sort_numbr);
+
+ /* break a tie with the index string itself */
+ if (ret == 0)
+ return sort_up_index_string((const void *) n1->sort_index,
+ (const void *) n2->sort_index);
+ return ret;
+}
+
+/* sort_down_index_number --- descending index numbers */
+
+static int
+sort_down_index_number(const void *p1, const void *p2)
+{
+ return -sort_up_index_number(p1, p2);
+}
+
+/*
+ * sort_up_value --- qsort comparison function; ascending values;
+ * standard awk comparison rules apply, so might be by number or might
+ * be by string depending upon the values involved; if a value happens
+ * to be a sub-array, it will compare greater than any scalar (last in
+ * ascending order) and equal with any other sub-array (disambiguated
+ * by index string, the same as with equal scalar values)
+ */
+
+static int
+sort_up_value(const void *p1, const void *p2)
+{
+ const NODE *idx1, *idx2;
+ NODE *val1, *val2;
+ int ret;
+
+ /* we're passed a pair of index (array subscript) nodes */
+ idx1 = *(const NODE *const *) p1;
+ idx2 = *(const NODE *const *) p2;
+ if (idx1->type != Node_ahash || idx2->type != Node_ahash)
+ cant_happen();
+
+ /* and we want to compare the element values they refer to */
+ val1 = idx1->hvalue;
+ val2 = idx2->hvalue;
+ if (val1->type == Node_var_array) {
+ if (val2->type == Node_var_array)
+ ret = 0; /* both sub-arrays, hence tie */
+ else
+ ret = 1; /* val1 (sub-array) > val2 (scalar) */
+ } else {
+ if (val2->type == Node_var_array)
+ ret = -1; /* val1 (scalar) < val2 (sub-array) */
+ else
+ ret = cmp_nodes(val1, val2); /* both scalars */
+ }
+
+ /* disambiguate ties to force a deterministic order */
+ if (ret == 0)
+ ret = sort_up_index_string(p1, p2);
+ return ret;
+}
+
+/* sort_down_value --- descending value maybe-nums */
+
+static int
+sort_down_value(const void *p1, const void *p2)
+{
+ return -sort_up_value(p1, p2);
+}
+
+/* sort_maybe_numeric_index --- special handling for sorting indices as numbers */
+
+void
+sort_maybe_numeric_index(qsort_compfunc func, NODE **list,
+ size_t num_elems, int setup)
+{
+ static const NODE empty_node;
+
+ size_t i;
+ NODE temp_node;
+ struct sort_num *fake_node;
+
+ /* if we're not sorting indices by numeric value, do nothing */
+ if (func != sort_up_index_number && func != sort_down_index_number)
+ return;
+
+ /*
+ * Insert a new struct in front of each index, so there's somewhere
+ * to store its numeric value after that's calculated once, avoiding
+ * having the qsort callback recalculate it for every comparison.
+ * Our caller will call us back to destroy this intermediate array
+ * as soon as qsort() finishes; the rest of gawk won't see it.
+ * We could use--or perhaps mis-use--gawk nodes here, but they'd tie
+ * up quite a bit more memory during the sort.
+ */
+ if (setup) {
+ temp_node = empty_node; /* init union fields to 0,NULL */
+ for (i = 0; i < num_elems; ++i) {
+ emalloc(fake_node, struct sort_num *,
+ sizeof (struct sort_num), "sort numeric index");
+ /* populate enough fields to satisfy force_number();
+ we don't need to maintain reference counts */
+ temp_node.type = Node_val;
+ temp_node.stptr = list[i]->ahname_str;
+ temp_node.stlen = list[i]->ahname_len;
+ temp_node.flags = PERM | STRING | MAYBE_NUM;
+ /* insert the extra data in front of our input node */
+ fake_node->sort_numbr = force_number(&temp_node);
+ fake_node->sort_index = list[i];
+ list[i] = (NODE *) fake_node;
+ }
+ } else { /* post sort cleanup */
+ for (i = 0; i < num_elems; ++i) {
+ /* undo the setup manipulations */
+ fake_node = (struct sort_num *) list[i];
+ list[i] = fake_node->sort_index;
+ efree((void *) fake_node);
+ }
+ }
+}
+
+/* sort_match --- compare leading part of a string against a sort option */
+
+static size_t
+sort_match(const char *str, size_t slen, const char *target)
+{
+ const char *endofword;
+
+ /*
+ * STR comes from the user and is SLEN characters long;
+ * if it has spaces then we care about the subset up until
+ * its first space rather than the full length.
+ * We check whether STR is the same as--or proper prefix
+ * of--TARGET.
+ * Accept "by-foo" or "as-foo" as a synonym match for "foo".
+ */
+ endofword = strchr(str, ' ');
+ if (endofword != NULL)
+ slen = (size_t) (endofword - str);
+ /* try for exact match */
+ if (slen > 0 && strncasecmp(str, target, slen) == 0)
+ return slen;
+ /* ingore "by-" or "as-" prefix and try again */
+ if (slen > 3
+ && (strncasecmp(str, "by-", 3) == 0
+ || strncasecmp(str, "as-", 3) == 0)
+ && strncasecmp(str + 3, target, slen - 3) == 0)
+ return slen;
+ /* no match */
+ return 0;
+}
+
+/*
+ * sort_selection --- parse user-specified sort ordering
+ * ("ascending index" or "value number" and so forth);
+ * returns a qsort comparison function
+ */
+
+static qsort_compfunc
+sort_selection(NODE *r, const char *warn_arg, int default_by_value)
+{
+ enum sort_bits {
+ unrecognized = 0,
+ Unsorted = 0x01,
+ Ascending = 0x02,
+ Descending = 0x04,
+ by_Index = 0x08,
+ by_Value = 0x10,
+ as_String = 0x20,
+ as_Number = 0x40,
+ as_Inserted = 0x80,
+ allbitsused = 0xff
+ };
+ /*
+ * The documented values are singular, but it's trivial to accept
+ * "index numbers" and "descending values" since we're using a
+ * prefix match. Latin plural of index is the only complication.
+ */
+ static const struct sort_keys {
+ const char *const keyword;
+ enum sort_bits keybit;
+ } sorts[] = {
+ { "unsorted", Unsorted }, /* not part of a three-part key */
+ { "ascending", Ascending }, /* ascending vs descending */
+ { "descending", Descending },
+ { "indexes", by_Index }, /* by_Index vs by_Number */
+ { "indices", by_Index }, /* synonym for plural match */
+ { "values", by_Value },
+ { "strings", as_String }, /* as_String vs as_Number */
+ { "numbers", as_Number },
+ { "numeric", as_Number }, /* synonym; singular only */
+ { "inserted", as_Inserted }, /* not part of a three-part key */
+ { "insertion", as_Inserted }, /* another synonym */
+ };
+
+ static short warned_unrecognized = FALSE;
+ static enum sort_bits prev_invalid = unrecognized;
+
+ char *s;
+ size_t l, matchlen;
+ unsigned i;
+ const char *errfmt;
+ enum sort_bits allparts;
+ qsort_compfunc sort_func, default_sort_func;
+
+ /* deduce our caller from the args provided */
+ if (warn_arg != NULL) /* for-in statement */
+ default_sort_func = (qsort_compfunc) 0; /* unsorted */
+ else if (default_by_value) /* asort() */
+ default_sort_func = sort_up_value;
+ else /* asorti() */
+ default_sort_func = ! IGNORECASE ? sort_up_index_string
+ : sort_up_index_ignrcase;
+
+ r = force_string(r);
+ s = r->stptr;
+ l = r->stlen;
+
+ /* treat empty sort-order string as request for default */
+ if (l == 0)
+ return default_sort_func;
+
+ /* no ordering specification yet */
+ allparts = unrecognized;
+ /*
+ * Scan through S until length L is exhausted, checking its
+ * starting word against each possible ordering choice and
+ * then skipping past that to get to the next word. If no
+ * choice matches, an error has been detected. Duplicate
+ * words are accepted; contradictory ones get noticed and
+ * rejected after the string parsing has completed.
+ */
+ while (l > 0) {
+ matchlen = 0; /* lint suppression */
+ for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) {
+ matchlen = sort_match(s, l, sorts[i].keyword);
+ if (matchlen > 0) {
+ allparts |= sorts[i].keybit;
+ break;
+ }
+ }
+ if (matchlen == 0) {
+ /* failed to match any possible component */
+ allparts = unrecognized;
+ break;
+ }
+ /* move past the part just handled */
+ s += matchlen, l -= matchlen;
+ /* (l > 0) here would accept a trailing space; (l > 1) won't */
+ if (l > 1 && *s == ' ')
+ ++s, --l;
+ } /* while (l > 0) */
+
+ /*
+ * If we have one or two parts of a three part key, fill in
+ * whichever parts are missing with default values. For key
+ * type the default is specified by our caller, for direction
+ * it is Ascending, and for comparison mode it is as_String
+ * but only when sorting by_Index. When sorting by_Value,
+ * comparison mode is out of the user's control and we don't
+ * need or want to fill in either as_String or as_Number.
+ */
+ if ((allparts & (Ascending|Descending|as_String|as_Number)) != 0
+ && (allparts & (Unsorted|by_Index|by_Value|as_Inserted)) == 0)
+ allparts |= default_by_value ? by_Value : by_Index;
+ if ((allparts & (by_Index|by_Value|as_String|as_Number)) != 0
+ && (allparts & (Unsorted|Ascending|Descending|as_Inserted)) == 0)
+ allparts |= Ascending;
+ /* by_Value is handled differently from by_Index */
+ if ((allparts & (Ascending|Descending|by_Index)) != 0
+ && (allparts & (Unsorted|by_Value|as_String|as_Number|as_Inserted)) == 0)
+ allparts |= as_String;
+
+ /*
+ * ALLPARTS is a bit mask of all the user's specified ordering
+ * choices; partial specifications have been extended into full
+ * three part key. Valid combinations yield a qsort(3) comparison
+ * routine result. Invalid ones are rejected and yield default
+ * sort, with an error message for asort()/asorti(), a warning
+ * for for-in statement if this is a different invalid combination
+ * than the most recently warned about one, or a one-time warning
+ * if this is an unrecognized ordering specification and we're
+ * running in lint checking mode.
+ */
+ sort_func = default_sort_func;
+ errfmt = NULL;
+ switch (allparts) {
+ case Unsorted:
+ sort_func = (qsort_compfunc) 0;
+ break;
+ case Ascending|by_Index|as_String:
+ sort_func = ! IGNORECASE ? sort_up_index_string
+ : sort_up_index_ignrcase;
+ break;
+ case Descending|by_Index|as_String:
+ sort_func = ! IGNORECASE ? sort_down_index_string
+ : sort_down_index_ignrcase;
+ break;
+ case Ascending|by_Index|as_Number:
+ sort_func = sort_up_index_number;
+ break;
+ case Descending|by_Index|as_Number:
+ sort_func = sort_down_index_number;
+ break;
+ case Ascending|by_Value:
+ sort_func = sort_up_value;
+ break;
+ case Descending|by_Value:
+ sort_func = sort_down_value;
+ break;
+ /* for value sorts, distinction between string and number is not
+ allowed; they're always ordered by basic awk MAYBE_NUM comparison */
+ case Ascending|by_Value|as_String:
+ case Descending|by_Value|as_String:
+ if (errfmt == NULL)
+ errfmt = _("%s: sorting by value can't be forced to use string comparison");
+ /* FALL THROUGH */
+ case Ascending|by_Value|as_Number:
+ case Descending|by_Value|as_Number:
+ if (errfmt == NULL)
+ errfmt = _("%s: sorting by value can't be forced to use numeric comparison");
+ /* FALL THROUGH */
+ /* gawk doesn't maintain the data necessary to sort by
+ order of insertion */
+ case as_Inserted:
+ if (errfmt == NULL)
+ errfmt = _("%s: sort ordering \"as-inserted\" is not implemented");
+ /* FALL THROUGH */
+ default:
+ /* sort_func still has default value */
+ if (errfmt == NULL)
+ errfmt = (allparts == unrecognized)
+ ? _("%s: sort specification is not recognized")
+ : _("%s: sort specification is invalid");
+ if (warn_arg == NULL) {
+ /* asort()/asorti() bad sort specification argument */
+ error(errfmt, default_by_value ? "asort" : "asorti");
+ } else if (allparts != unrecognized) {
+ /* invalid combination of valid ordering components */
+ if (allparts != prev_invalid)
+ warning(errfmt, warn_arg);
+ prev_invalid = allparts;
+ } else if (do_lint) {
+ /* unrecognized ordering specification string */
+ if (! warned_unrecognized)
+ lintwarn(errfmt, warn_arg);
+ warned_unrecognized = TRUE;
+ }
+ break;
+ }
+ return sort_func;
+}
+
+/*
+ * sorted_in --- fetch value of PROCINFO["sorted_in"] and use sort_selection()
+ * to parse it; controls whether and how traversal of ``for (index in array)''
+ * should be sorted; returns a qsort comparison function
+ */
+
+qsort_compfunc
+sorted_in(void) /* called from r_interpret(eval.c) */
+{
+ static NODE *sorted_str = NULL;
+ static short warned_extension = FALSE;
+
+ NODE *r;
+
+ /* if there's no PROCINFO[], there's no ["sorted_in"], so no sorting */
+ if (PROCINFO_node == NULL)
+ return (qsort_compfunc) 0;
+
+ if (sorted_str == NULL) /* do this once */
+ sorted_str = make_string("sorted_in", 9);
+
+ r = (NODE *) NULL;
+ if (in_array(PROCINFO_node, sorted_str))
+ r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE);
+ /* if there's no PROCINFO["sorted_in"], there's no sorting */
+ if (r == NULL)
+ return (qsort_compfunc) 0;
+
+ /* we're going to make use of PROCINFO["sorted_in"] */
+ if (do_lint && ! warned_extension) {
+ warned_extension = TRUE;
+ lintwarn(_("sorted array traversal is a gawk extension"));
+ }
+
+ return sort_selection(r, "`PROCINFO[\"sorted_in\"]'", FALSE);
+}
+
+
/*
From bonzini@gnu.org Mon Oct 28 16:05:26 2002
Date: Mon, 28 Oct 2002 13:33:03 +0100