diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2011-03-29 20:51:30 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2011-03-29 20:51:30 +0200 |
commit | 2400cc5143383a881356a9f55e93b60037d851e5 (patch) | |
tree | 3ba75f8b1225bf50e48b4f6c381e7772f1ab6c5b /array.c | |
parent | 4fe569fb78dd1b25822c16c9cac515a0fc6702a4 (diff) | |
download | egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.gz egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.bz2 egawk-2400cc5143383a881356a9f55e93b60037d851e5.zip |
Revise array sorting for PROCINFO["sorted_in"].
Diffstat (limited to 'array.c')
-rw-r--r-- | array.c | 545 |
1 files changed, 545 insertions, 0 deletions
@@ -54,6 +54,24 @@ static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, si unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash; +/* used by sort_maybe_numeric_index() and sort_up_index_number() */ +struct sort_num { AWKNUM sort_numbr; NODE *sort_index; }; + +/* sorting support code */ +static qsort_compfunc sort_selection(NODE *, const char *, int); +static size_t sort_match(const char *, size_t, const char *); +static int sort_ignorecase(const char *, size_t, const char *, size_t); + +/* qsort callback routines */ +static int sort_up_index_string(const void *, const void *); +static int sort_down_index_string(const void *, const void *); +static int sort_up_index_ignrcase(const void *, const void *); +static int sort_down_index_ignrcase(const void *, const void *); +static int sort_up_index_number(const void *, const void *); +static int sort_down_index_number(const void *, const void *); +static int sort_up_value(const void *, const void *); +static int sort_down_value(const void *, const void *); + /* array_init --- possibly temporary function for experimentation purposes */ void @@ -1273,6 +1291,533 @@ do_asorti(int nargs) return asort_actual(nargs, INDEX); } + +/* comp_func --- array index comparison function for qsort, used in debug.c */ + +int +comp_func(const void *p1, const void *p2) +{ + return sort_up_index_string(p1, p2); +} + +/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */ + +static int +sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2) +{ + size_t l; + int ret; + + l = (l1 < l2) ? l1 : l2; +#ifdef MBS_SUPPORT + if (gawk_mb_cur_max > 1) { + ret = strncasecmpmbs((const unsigned char *) s1, + (const unsigned char *) s2, l); + } else +#endif + for (ret = 0; l-- > 0 && ret == 0; s1++, s2++) + ret = casetable[*(unsigned char *) s1] + - casetable[*(unsigned char *) s2]; + if (ret == 0 && l1 != l2) + ret = (l1 < l2) ? -1 : 1; + return ret; +} + +/* + * sort_up_index_string --- qsort comparison function; ascending index strings; + * index strings are distinct within an array, so no comparisons will yield + * equal and warrant disambiguation + */ + +static int +sort_up_index_string(const void *p1, const void *p2) +{ + const NODE *t1, *t2; + const char *str1, *str2; + size_t len1, len2; + int ret; + + /* Array indexes are strings and distinct, never equal */ + t1 = *((const NODE *const *) p1); + t2 = *((const NODE *const *) p2); + + str1 = t1->ahname_str; + len1 = t1->ahname_len; + str2 = t2->ahname_str; + len2 = t2->ahname_len; + + ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2); + /* + * if they compared equal but their lengths differ, the + * shorter one is a prefix of the longer and compares as less + */ + if (ret == 0 && len1 != len2) + ret = (len1 < len2) ? -1 : 1; + + /* indices are unique within an array, so result should never be 0 */ + assert(ret != 0); + return ret; +} + +/* sort_down_index_string --- descending index strings */ + +static int +sort_down_index_string(const void *p1, const void *p2) +{ + /* + * Negation versus transposed arguments: when all keys are + * distinct, as with array indices here, either method will + * transform an ascending sort into a descending one. But if + * there are equal keys--such as when IGNORECASE is honored-- + * that get disambiguated into a determisitc order, negation + * will reverse those but transposed arguments would retain + * their relative order within the rest of the reversed sort. + */ + return -sort_up_index_string(p1, p2); +} + +/* + * sort_up_index_ignrcase --- ascending index string, case insensitive; + * case insensitive comparison can cause distinct index strings to compare + * equal, so disambiguation in necessary + */ + +static int +sort_up_index_ignrcase(const void *p1, const void *p2) +{ + const NODE *t1, *t2; + int ret; + + t1 = *((const NODE *const *) p1); + t2 = *((const NODE *const *) p2); + + ret = sort_ignorecase(t1->ahname_str, t1->ahname_len, + t2->ahname_str, t2->ahname_len); + + /* + * if case insensitive result is "they're the same", + * use case sensitive comparison to force distinct order + */ + if (ret == 0) + ret = sort_up_index_string(p1, p2); + return ret; +} + +/* sort_down_index_ignrcase --- descending index strings, case insensitive */ + +static int +sort_down_index_ignrcase(const void *p1, const void *p2) +{ + return -sort_up_index_ignrcase(p1, p2); +} + +/* + * sort_up_index_number --- qsort comparison function; ascending index numbers; + * the args have been built for us by sort_maybe_numeric_index() and point + * to custom structs rather than to gawk nodes + */ + +static int +sort_up_index_number(const void *p1, const void *p2) +{ + const struct sort_num *n1, *n2; + int ret; + + n1 = *((const struct sort_num *const *) p1); + n2 = *((const struct sort_num *const *) p2); + + if (n1->sort_numbr < n2->sort_numbr) + ret = -1; + else + ret = (n1->sort_numbr > n2->sort_numbr); + + /* break a tie with the index string itself */ + if (ret == 0) + return sort_up_index_string((const void *) n1->sort_index, + (const void *) n2->sort_index); + return ret; +} + +/* sort_down_index_number --- descending index numbers */ + +static int +sort_down_index_number(const void *p1, const void *p2) +{ + return -sort_up_index_number(p1, p2); +} + +/* + * sort_up_value --- qsort comparison function; ascending values; + * standard awk comparison rules apply, so might be by number or might + * be by string depending upon the values involved; if a value happens + * to be a sub-array, it will compare greater than any scalar (last in + * ascending order) and equal with any other sub-array (disambiguated + * by index string, the same as with equal scalar values) + */ + +static int +sort_up_value(const void *p1, const void *p2) +{ + const NODE *idx1, *idx2; + NODE *val1, *val2; + int ret; + + /* we're passed a pair of index (array subscript) nodes */ + idx1 = *(const NODE *const *) p1; + idx2 = *(const NODE *const *) p2; + if (idx1->type != Node_ahash || idx2->type != Node_ahash) + cant_happen(); + + /* and we want to compare the element values they refer to */ + val1 = idx1->hvalue; + val2 = idx2->hvalue; + if (val1->type == Node_var_array) { + if (val2->type == Node_var_array) + ret = 0; /* both sub-arrays, hence tie */ + else + ret = 1; /* val1 (sub-array) > val2 (scalar) */ + } else { + if (val2->type == Node_var_array) + ret = -1; /* val1 (scalar) < val2 (sub-array) */ + else + ret = cmp_nodes(val1, val2); /* both scalars */ + } + + /* disambiguate ties to force a deterministic order */ + if (ret == 0) + ret = sort_up_index_string(p1, p2); + return ret; +} + +/* sort_down_value --- descending value maybe-nums */ + +static int +sort_down_value(const void *p1, const void *p2) +{ + return -sort_up_value(p1, p2); +} + +/* sort_maybe_numeric_index --- special handling for sorting indices as numbers */ + +void +sort_maybe_numeric_index(qsort_compfunc func, NODE **list, + size_t num_elems, int setup) +{ + static const NODE empty_node; + + size_t i; + NODE temp_node; + struct sort_num *fake_node; + + /* if we're not sorting indices by numeric value, do nothing */ + if (func != sort_up_index_number && func != sort_down_index_number) + return; + + /* + * Insert a new struct in front of each index, so there's somewhere + * to store its numeric value after that's calculated once, avoiding + * having the qsort callback recalculate it for every comparison. + * Our caller will call us back to destroy this intermediate array + * as soon as qsort() finishes; the rest of gawk won't see it. + * We could use--or perhaps mis-use--gawk nodes here, but they'd tie + * up quite a bit more memory during the sort. + */ + if (setup) { + temp_node = empty_node; /* init union fields to 0,NULL */ + for (i = 0; i < num_elems; ++i) { + emalloc(fake_node, struct sort_num *, + sizeof (struct sort_num), "sort numeric index"); + /* populate enough fields to satisfy force_number(); + we don't need to maintain reference counts */ + temp_node.type = Node_val; + temp_node.stptr = list[i]->ahname_str; + temp_node.stlen = list[i]->ahname_len; + temp_node.flags = PERM | STRING | MAYBE_NUM; + /* insert the extra data in front of our input node */ + fake_node->sort_numbr = force_number(&temp_node); + fake_node->sort_index = list[i]; + list[i] = (NODE *) fake_node; + } + } else { /* post sort cleanup */ + for (i = 0; i < num_elems; ++i) { + /* undo the setup manipulations */ + fake_node = (struct sort_num *) list[i]; + list[i] = fake_node->sort_index; + efree((void *) fake_node); + } + } +} + +/* sort_match --- compare leading part of a string against a sort option */ + +static size_t +sort_match(const char *str, size_t slen, const char *target) +{ + const char *endofword; + + /* + * STR comes from the user and is SLEN characters long; + * if it has spaces then we care about the subset up until + * its first space rather than the full length. + * We check whether STR is the same as--or proper prefix + * of--TARGET. + * Accept "by-foo" or "as-foo" as a synonym match for "foo". + */ + endofword = strchr(str, ' '); + if (endofword != NULL) + slen = (size_t) (endofword - str); + /* try for exact match */ + if (slen > 0 && strncasecmp(str, target, slen) == 0) + return slen; + /* ingore "by-" or "as-" prefix and try again */ + if (slen > 3 + && (strncasecmp(str, "by-", 3) == 0 + || strncasecmp(str, "as-", 3) == 0) + && strncasecmp(str + 3, target, slen - 3) == 0) + return slen; + /* no match */ + return 0; +} + +/* + * sort_selection --- parse user-specified sort ordering + * ("ascending index" or "value number" and so forth); + * returns a qsort comparison function + */ + +static qsort_compfunc +sort_selection(NODE *r, const char *warn_arg, int default_by_value) +{ + enum sort_bits { + unrecognized = 0, + Unsorted = 0x01, + Ascending = 0x02, + Descending = 0x04, + by_Index = 0x08, + by_Value = 0x10, + as_String = 0x20, + as_Number = 0x40, + as_Inserted = 0x80, + allbitsused = 0xff + }; + /* + * The documented values are singular, but it's trivial to accept + * "index numbers" and "descending values" since we're using a + * prefix match. Latin plural of index is the only complication. + */ + static const struct sort_keys { + const char *const keyword; + enum sort_bits keybit; + } sorts[] = { + { "unsorted", Unsorted }, /* not part of a three-part key */ + { "ascending", Ascending }, /* ascending vs descending */ + { "descending", Descending }, + { "indexes", by_Index }, /* by_Index vs by_Number */ + { "indices", by_Index }, /* synonym for plural match */ + { "values", by_Value }, + { "strings", as_String }, /* as_String vs as_Number */ + { "numbers", as_Number }, + { "numeric", as_Number }, /* synonym; singular only */ + { "inserted", as_Inserted }, /* not part of a three-part key */ + { "insertion", as_Inserted }, /* another synonym */ + }; + + static short warned_unrecognized = FALSE; + static enum sort_bits prev_invalid = unrecognized; + + char *s; + size_t l, matchlen; + unsigned i; + const char *errfmt; + enum sort_bits allparts; + qsort_compfunc sort_func, default_sort_func; + + /* deduce our caller from the args provided */ + if (warn_arg != NULL) /* for-in statement */ + default_sort_func = (qsort_compfunc) 0; /* unsorted */ + else if (default_by_value) /* asort() */ + default_sort_func = sort_up_value; + else /* asorti() */ + default_sort_func = ! IGNORECASE ? sort_up_index_string + : sort_up_index_ignrcase; + + r = force_string(r); + s = r->stptr; + l = r->stlen; + + /* treat empty sort-order string as request for default */ + if (l == 0) + return default_sort_func; + + /* no ordering specification yet */ + allparts = unrecognized; + /* + * Scan through S until length L is exhausted, checking its + * starting word against each possible ordering choice and + * then skipping past that to get to the next word. If no + * choice matches, an error has been detected. Duplicate + * words are accepted; contradictory ones get noticed and + * rejected after the string parsing has completed. + */ + while (l > 0) { + matchlen = 0; /* lint suppression */ + for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) { + matchlen = sort_match(s, l, sorts[i].keyword); + if (matchlen > 0) { + allparts |= sorts[i].keybit; + break; + } + } + if (matchlen == 0) { + /* failed to match any possible component */ + allparts = unrecognized; + break; + } + /* move past the part just handled */ + s += matchlen, l -= matchlen; + /* (l > 0) here would accept a trailing space; (l > 1) won't */ + if (l > 1 && *s == ' ') + ++s, --l; + } /* while (l > 0) */ + + /* + * If we have one or two parts of a three part key, fill in + * whichever parts are missing with default values. For key + * type the default is specified by our caller, for direction + * it is Ascending, and for comparison mode it is as_String + * but only when sorting by_Index. When sorting by_Value, + * comparison mode is out of the user's control and we don't + * need or want to fill in either as_String or as_Number. + */ + if ((allparts & (Ascending|Descending|as_String|as_Number)) != 0 + && (allparts & (Unsorted|by_Index|by_Value|as_Inserted)) == 0) + allparts |= default_by_value ? by_Value : by_Index; + if ((allparts & (by_Index|by_Value|as_String|as_Number)) != 0 + && (allparts & (Unsorted|Ascending|Descending|as_Inserted)) == 0) + allparts |= Ascending; + /* by_Value is handled differently from by_Index */ + if ((allparts & (Ascending|Descending|by_Index)) != 0 + && (allparts & (Unsorted|by_Value|as_String|as_Number|as_Inserted)) == 0) + allparts |= as_String; + + /* + * ALLPARTS is a bit mask of all the user's specified ordering + * choices; partial specifications have been extended into full + * three part key. Valid combinations yield a qsort(3) comparison + * routine result. Invalid ones are rejected and yield default + * sort, with an error message for asort()/asorti(), a warning + * for for-in statement if this is a different invalid combination + * than the most recently warned about one, or a one-time warning + * if this is an unrecognized ordering specification and we're + * running in lint checking mode. + */ + sort_func = default_sort_func; + errfmt = NULL; + switch (allparts) { + case Unsorted: + sort_func = (qsort_compfunc) 0; + break; + case Ascending|by_Index|as_String: + sort_func = ! IGNORECASE ? sort_up_index_string + : sort_up_index_ignrcase; + break; + case Descending|by_Index|as_String: + sort_func = ! IGNORECASE ? sort_down_index_string + : sort_down_index_ignrcase; + break; + case Ascending|by_Index|as_Number: + sort_func = sort_up_index_number; + break; + case Descending|by_Index|as_Number: + sort_func = sort_down_index_number; + break; + case Ascending|by_Value: + sort_func = sort_up_value; + break; + case Descending|by_Value: + sort_func = sort_down_value; + break; + /* for value sorts, distinction between string and number is not + allowed; they're always ordered by basic awk MAYBE_NUM comparison */ + case Ascending|by_Value|as_String: + case Descending|by_Value|as_String: + if (errfmt == NULL) + errfmt = _("%s: sorting by value can't be forced to use string comparison"); + /* FALL THROUGH */ + case Ascending|by_Value|as_Number: + case Descending|by_Value|as_Number: + if (errfmt == NULL) + errfmt = _("%s: sorting by value can't be forced to use numeric comparison"); + /* FALL THROUGH */ + /* gawk doesn't maintain the data necessary to sort by + order of insertion */ + case as_Inserted: + if (errfmt == NULL) + errfmt = _("%s: sort ordering \"as-inserted\" is not implemented"); + /* FALL THROUGH */ + default: + /* sort_func still has default value */ + if (errfmt == NULL) + errfmt = (allparts == unrecognized) + ? _("%s: sort specification is not recognized") + : _("%s: sort specification is invalid"); + if (warn_arg == NULL) { + /* asort()/asorti() bad sort specification argument */ + error(errfmt, default_by_value ? "asort" : "asorti"); + } else if (allparts != unrecognized) { + /* invalid combination of valid ordering components */ + if (allparts != prev_invalid) + warning(errfmt, warn_arg); + prev_invalid = allparts; + } else if (do_lint) { + /* unrecognized ordering specification string */ + if (! warned_unrecognized) + lintwarn(errfmt, warn_arg); + warned_unrecognized = TRUE; + } + break; + } + return sort_func; +} + +/* + * sorted_in --- fetch value of PROCINFO["sorted_in"] and use sort_selection() + * to parse it; controls whether and how traversal of ``for (index in array)'' + * should be sorted; returns a qsort comparison function + */ + +qsort_compfunc +sorted_in(void) /* called from r_interpret(eval.c) */ +{ + static NODE *sorted_str = NULL; + static short warned_extension = FALSE; + + NODE *r; + + /* if there's no PROCINFO[], there's no ["sorted_in"], so no sorting */ + if (PROCINFO_node == NULL) + return (qsort_compfunc) 0; + + if (sorted_str == NULL) /* do this once */ + sorted_str = make_string("sorted_in", 9); + + r = (NODE *) NULL; + if (in_array(PROCINFO_node, sorted_str)) + r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE); + /* if there's no PROCINFO["sorted_in"], there's no sorting */ + if (r == NULL) + return (qsort_compfunc) 0; + + /* we're going to make use of PROCINFO["sorted_in"] */ + if (do_lint && ! warned_extension) { + warned_extension = TRUE; + lintwarn(_("sorted array traversal is a gawk extension")); + } + + return sort_selection(r, "`PROCINFO[\"sorted_in\"]'", FALSE); +} + + /* From bonzini@gnu.org Mon Oct 28 16:05:26 2002 Date: Mon, 28 Oct 2002 13:33:03 +0100 |