diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2011-03-29 20:51:30 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2011-03-29 20:51:30 +0200 |
commit | 2400cc5143383a881356a9f55e93b60037d851e5 (patch) | |
tree | 3ba75f8b1225bf50e48b4f6c381e7772f1ab6c5b | |
parent | 4fe569fb78dd1b25822c16c9cac515a0fc6702a4 (diff) | |
download | egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.gz egawk-2400cc5143383a881356a9f55e93b60037d851e5.tar.bz2 egawk-2400cc5143383a881356a9f55e93b60037d851e5.zip |
Revise array sorting for PROCINFO["sorted_in"].
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | array.c | 545 | ||||
-rw-r--r-- | awk.h | 7 | ||||
-rw-r--r-- | doc/ChangeLog | 10 | ||||
-rw-r--r-- | doc/gawk.1 | 19 | ||||
-rw-r--r-- | doc/gawk.info | 794 | ||||
-rw-r--r-- | doc/gawk.texi | 104 | ||||
-rw-r--r-- | eval.c | 260 |
8 files changed, 1114 insertions, 643 deletions
@@ -1,3 +1,21 @@ +Tue Mar 29 20:45:49 2011 Pat Rankin <rankin@patechdata.com> + + Move the code to support sorting `for (index in array)' from + eval.c to array.c, and implement several additional orderings. + + * array.c (comp_func, sorted_in, sort_ignorecase, + sort_up_index_ignrcase, sort_down_index_ignrcase): Move from eval.c. + (sort_up_index_string, sort_down_index_string): Move from eval.c + and rename from *_str to *_string. + (sort_selection, sort_match, sort_maybe_numeric_index, + sort_up_index_number, sort_down_index_number, + sort_up_value, sort_down_value): New routines. + * eval.c (sort_&c): Move to array.c. + (r_interpret: case Op_arrayfor_init): Call sort_maybe_numeric_index + before and after qsort. + * awk.h (qsort_compfunc): New typedef. + (sorted_in, sort_maybe_numeric_index): Declare. + Fri Mar 25 13:15:36 2011 Arnold D. Robbins <arnold@skeeve.com> * awk.h: Move libsigsegv portability checks to here from main.c. @@ -54,6 +54,24 @@ static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, si unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash; +/* used by sort_maybe_numeric_index() and sort_up_index_number() */ +struct sort_num { AWKNUM sort_numbr; NODE *sort_index; }; + +/* sorting support code */ +static qsort_compfunc sort_selection(NODE *, const char *, int); +static size_t sort_match(const char *, size_t, const char *); +static int sort_ignorecase(const char *, size_t, const char *, size_t); + +/* qsort callback routines */ +static int sort_up_index_string(const void *, const void *); +static int sort_down_index_string(const void *, const void *); +static int sort_up_index_ignrcase(const void *, const void *); +static int sort_down_index_ignrcase(const void *, const void *); +static int sort_up_index_number(const void *, const void *); +static int sort_down_index_number(const void *, const void *); +static int sort_up_value(const void *, const void *); +static int sort_down_value(const void *, const void *); + /* array_init --- possibly temporary function for experimentation purposes */ void @@ -1273,6 +1291,533 @@ do_asorti(int nargs) return asort_actual(nargs, INDEX); } + +/* comp_func --- array index comparison function for qsort, used in debug.c */ + +int +comp_func(const void *p1, const void *p2) +{ + return sort_up_index_string(p1, p2); +} + +/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */ + +static int +sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2) +{ + size_t l; + int ret; + + l = (l1 < l2) ? l1 : l2; +#ifdef MBS_SUPPORT + if (gawk_mb_cur_max > 1) { + ret = strncasecmpmbs((const unsigned char *) s1, + (const unsigned char *) s2, l); + } else +#endif + for (ret = 0; l-- > 0 && ret == 0; s1++, s2++) + ret = casetable[*(unsigned char *) s1] + - casetable[*(unsigned char *) s2]; + if (ret == 0 && l1 != l2) + ret = (l1 < l2) ? -1 : 1; + return ret; +} + +/* + * sort_up_index_string --- qsort comparison function; ascending index strings; + * index strings are distinct within an array, so no comparisons will yield + * equal and warrant disambiguation + */ + +static int +sort_up_index_string(const void *p1, const void *p2) +{ + const NODE *t1, *t2; + const char *str1, *str2; + size_t len1, len2; + int ret; + + /* Array indexes are strings and distinct, never equal */ + t1 = *((const NODE *const *) p1); + t2 = *((const NODE *const *) p2); + + str1 = t1->ahname_str; + len1 = t1->ahname_len; + str2 = t2->ahname_str; + len2 = t2->ahname_len; + + ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2); + /* + * if they compared equal but their lengths differ, the + * shorter one is a prefix of the longer and compares as less + */ + if (ret == 0 && len1 != len2) + ret = (len1 < len2) ? -1 : 1; + + /* indices are unique within an array, so result should never be 0 */ + assert(ret != 0); + return ret; +} + +/* sort_down_index_string --- descending index strings */ + +static int +sort_down_index_string(const void *p1, const void *p2) +{ + /* + * Negation versus transposed arguments: when all keys are + * distinct, as with array indices here, either method will + * transform an ascending sort into a descending one. But if + * there are equal keys--such as when IGNORECASE is honored-- + * that get disambiguated into a determisitc order, negation + * will reverse those but transposed arguments would retain + * their relative order within the rest of the reversed sort. + */ + return -sort_up_index_string(p1, p2); +} + +/* + * sort_up_index_ignrcase --- ascending index string, case insensitive; + * case insensitive comparison can cause distinct index strings to compare + * equal, so disambiguation in necessary + */ + +static int +sort_up_index_ignrcase(const void *p1, const void *p2) +{ + const NODE *t1, *t2; + int ret; + + t1 = *((const NODE *const *) p1); + t2 = *((const NODE *const *) p2); + + ret = sort_ignorecase(t1->ahname_str, t1->ahname_len, + t2->ahname_str, t2->ahname_len); + + /* + * if case insensitive result is "they're the same", + * use case sensitive comparison to force distinct order + */ + if (ret == 0) + ret = sort_up_index_string(p1, p2); + return ret; +} + +/* sort_down_index_ignrcase --- descending index strings, case insensitive */ + +static int +sort_down_index_ignrcase(const void *p1, const void *p2) +{ + return -sort_up_index_ignrcase(p1, p2); +} + +/* + * sort_up_index_number --- qsort comparison function; ascending index numbers; + * the args have been built for us by sort_maybe_numeric_index() and point + * to custom structs rather than to gawk nodes + */ + +static int +sort_up_index_number(const void *p1, const void *p2) +{ + const struct sort_num *n1, *n2; + int ret; + + n1 = *((const struct sort_num *const *) p1); + n2 = *((const struct sort_num *const *) p2); + + if (n1->sort_numbr < n2->sort_numbr) + ret = -1; + else + ret = (n1->sort_numbr > n2->sort_numbr); + + /* break a tie with the index string itself */ + if (ret == 0) + return sort_up_index_string((const void *) n1->sort_index, + (const void *) n2->sort_index); + return ret; +} + +/* sort_down_index_number --- descending index numbers */ + +static int +sort_down_index_number(const void *p1, const void *p2) +{ + return -sort_up_index_number(p1, p2); +} + +/* + * sort_up_value --- qsort comparison function; ascending values; + * standard awk comparison rules apply, so might be by number or might + * be by string depending upon the values involved; if a value happens + * to be a sub-array, it will compare greater than any scalar (last in + * ascending order) and equal with any other sub-array (disambiguated + * by index string, the same as with equal scalar values) + */ + +static int +sort_up_value(const void *p1, const void *p2) +{ + const NODE *idx1, *idx2; + NODE *val1, *val2; + int ret; + + /* we're passed a pair of index (array subscript) nodes */ + idx1 = *(const NODE *const *) p1; + idx2 = *(const NODE *const *) p2; + if (idx1->type != Node_ahash || idx2->type != Node_ahash) + cant_happen(); + + /* and we want to compare the element values they refer to */ + val1 = idx1->hvalue; + val2 = idx2->hvalue; + if (val1->type == Node_var_array) { + if (val2->type == Node_var_array) + ret = 0; /* both sub-arrays, hence tie */ + else + ret = 1; /* val1 (sub-array) > val2 (scalar) */ + } else { + if (val2->type == Node_var_array) + ret = -1; /* val1 (scalar) < val2 (sub-array) */ + else + ret = cmp_nodes(val1, val2); /* both scalars */ + } + + /* disambiguate ties to force a deterministic order */ + if (ret == 0) + ret = sort_up_index_string(p1, p2); + return ret; +} + +/* sort_down_value --- descending value maybe-nums */ + +static int +sort_down_value(const void *p1, const void *p2) +{ + return -sort_up_value(p1, p2); +} + +/* sort_maybe_numeric_index --- special handling for sorting indices as numbers */ + +void +sort_maybe_numeric_index(qsort_compfunc func, NODE **list, + size_t num_elems, int setup) +{ + static const NODE empty_node; + + size_t i; + NODE temp_node; + struct sort_num *fake_node; + + /* if we're not sorting indices by numeric value, do nothing */ + if (func != sort_up_index_number && func != sort_down_index_number) + return; + + /* + * Insert a new struct in front of each index, so there's somewhere + * to store its numeric value after that's calculated once, avoiding + * having the qsort callback recalculate it for every comparison. + * Our caller will call us back to destroy this intermediate array + * as soon as qsort() finishes; the rest of gawk won't see it. + * We could use--or perhaps mis-use--gawk nodes here, but they'd tie + * up quite a bit more memory during the sort. + */ + if (setup) { + temp_node = empty_node; /* init union fields to 0,NULL */ + for (i = 0; i < num_elems; ++i) { + emalloc(fake_node, struct sort_num *, + sizeof (struct sort_num), "sort numeric index"); + /* populate enough fields to satisfy force_number(); + we don't need to maintain reference counts */ + temp_node.type = Node_val; + temp_node.stptr = list[i]->ahname_str; + temp_node.stlen = list[i]->ahname_len; + temp_node.flags = PERM | STRING | MAYBE_NUM; + /* insert the extra data in front of our input node */ + fake_node->sort_numbr = force_number(&temp_node); + fake_node->sort_index = list[i]; + list[i] = (NODE *) fake_node; + } + } else { /* post sort cleanup */ + for (i = 0; i < num_elems; ++i) { + /* undo the setup manipulations */ + fake_node = (struct sort_num *) list[i]; + list[i] = fake_node->sort_index; + efree((void *) fake_node); + } + } +} + +/* sort_match --- compare leading part of a string against a sort option */ + +static size_t +sort_match(const char *str, size_t slen, const char *target) +{ + const char *endofword; + + /* + * STR comes from the user and is SLEN characters long; + * if it has spaces then we care about the subset up until + * its first space rather than the full length. + * We check whether STR is the same as--or proper prefix + * of--TARGET. + * Accept "by-foo" or "as-foo" as a synonym match for "foo". + */ + endofword = strchr(str, ' '); + if (endofword != NULL) + slen = (size_t) (endofword - str); + /* try for exact match */ + if (slen > 0 && strncasecmp(str, target, slen) == 0) + return slen; + /* ingore "by-" or "as-" prefix and try again */ + if (slen > 3 + && (strncasecmp(str, "by-", 3) == 0 + || strncasecmp(str, "as-", 3) == 0) + && strncasecmp(str + 3, target, slen - 3) == 0) + return slen; + /* no match */ + return 0; +} + +/* + * sort_selection --- parse user-specified sort ordering + * ("ascending index" or "value number" and so forth); + * returns a qsort comparison function + */ + +static qsort_compfunc +sort_selection(NODE *r, const char *warn_arg, int default_by_value) +{ + enum sort_bits { + unrecognized = 0, + Unsorted = 0x01, + Ascending = 0x02, + Descending = 0x04, + by_Index = 0x08, + by_Value = 0x10, + as_String = 0x20, + as_Number = 0x40, + as_Inserted = 0x80, + allbitsused = 0xff + }; + /* + * The documented values are singular, but it's trivial to accept + * "index numbers" and "descending values" since we're using a + * prefix match. Latin plural of index is the only complication. + */ + static const struct sort_keys { + const char *const keyword; + enum sort_bits keybit; + } sorts[] = { + { "unsorted", Unsorted }, /* not part of a three-part key */ + { "ascending", Ascending }, /* ascending vs descending */ + { "descending", Descending }, + { "indexes", by_Index }, /* by_Index vs by_Number */ + { "indices", by_Index }, /* synonym for plural match */ + { "values", by_Value }, + { "strings", as_String }, /* as_String vs as_Number */ + { "numbers", as_Number }, + { "numeric", as_Number }, /* synonym; singular only */ + { "inserted", as_Inserted }, /* not part of a three-part key */ + { "insertion", as_Inserted }, /* another synonym */ + }; + + static short warned_unrecognized = FALSE; + static enum sort_bits prev_invalid = unrecognized; + + char *s; + size_t l, matchlen; + unsigned i; + const char *errfmt; + enum sort_bits allparts; + qsort_compfunc sort_func, default_sort_func; + + /* deduce our caller from the args provided */ + if (warn_arg != NULL) /* for-in statement */ + default_sort_func = (qsort_compfunc) 0; /* unsorted */ + else if (default_by_value) /* asort() */ + default_sort_func = sort_up_value; + else /* asorti() */ + default_sort_func = ! IGNORECASE ? sort_up_index_string + : sort_up_index_ignrcase; + + r = force_string(r); + s = r->stptr; + l = r->stlen; + + /* treat empty sort-order string as request for default */ + if (l == 0) + return default_sort_func; + + /* no ordering specification yet */ + allparts = unrecognized; + /* + * Scan through S until length L is exhausted, checking its + * starting word against each possible ordering choice and + * then skipping past that to get to the next word. If no + * choice matches, an error has been detected. Duplicate + * words are accepted; contradictory ones get noticed and + * rejected after the string parsing has completed. + */ + while (l > 0) { + matchlen = 0; /* lint suppression */ + for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) { + matchlen = sort_match(s, l, sorts[i].keyword); + if (matchlen > 0) { + allparts |= sorts[i].keybit; + break; + } + } + if (matchlen == 0) { + /* failed to match any possible component */ + allparts = unrecognized; + break; + } + /* move past the part just handled */ + s += matchlen, l -= matchlen; + /* (l > 0) here would accept a trailing space; (l > 1) won't */ + if (l > 1 && *s == ' ') + ++s, --l; + } /* while (l > 0) */ + + /* + * If we have one or two parts of a three part key, fill in + * whichever parts are missing with default values. For key + * type the default is specified by our caller, for direction + * it is Ascending, and for comparison mode it is as_String + * but only when sorting by_Index. When sorting by_Value, + * comparison mode is out of the user's control and we don't + * need or want to fill in either as_String or as_Number. + */ + if ((allparts & (Ascending|Descending|as_String|as_Number)) != 0 + && (allparts & (Unsorted|by_Index|by_Value|as_Inserted)) == 0) + allparts |= default_by_value ? by_Value : by_Index; + if ((allparts & (by_Index|by_Value|as_String|as_Number)) != 0 + && (allparts & (Unsorted|Ascending|Descending|as_Inserted)) == 0) + allparts |= Ascending; + /* by_Value is handled differently from by_Index */ + if ((allparts & (Ascending|Descending|by_Index)) != 0 + && (allparts & (Unsorted|by_Value|as_String|as_Number|as_Inserted)) == 0) + allparts |= as_String; + + /* + * ALLPARTS is a bit mask of all the user's specified ordering + * choices; partial specifications have been extended into full + * three part key. Valid combinations yield a qsort(3) comparison + * routine result. Invalid ones are rejected and yield default + * sort, with an error message for asort()/asorti(), a warning + * for for-in statement if this is a different invalid combination + * than the most recently warned about one, or a one-time warning + * if this is an unrecognized ordering specification and we're + * running in lint checking mode. + */ + sort_func = default_sort_func; + errfmt = NULL; + switch (allparts) { + case Unsorted: + sort_func = (qsort_compfunc) 0; + break; + case Ascending|by_Index|as_String: + sort_func = ! IGNORECASE ? sort_up_index_string + : sort_up_index_ignrcase; + break; + case Descending|by_Index|as_String: + sort_func = ! IGNORECASE ? sort_down_index_string + : sort_down_index_ignrcase; + break; + case Ascending|by_Index|as_Number: + sort_func = sort_up_index_number; + break; + case Descending|by_Index|as_Number: + sort_func = sort_down_index_number; + break; + case Ascending|by_Value: + sort_func = sort_up_value; + break; + case Descending|by_Value: + sort_func = sort_down_value; + break; + /* for value sorts, distinction between string and number is not + allowed; they're always ordered by basic awk MAYBE_NUM comparison */ + case Ascending|by_Value|as_String: + case Descending|by_Value|as_String: + if (errfmt == NULL) + errfmt = _("%s: sorting by value can't be forced to use string comparison"); + /* FALL THROUGH */ + case Ascending|by_Value|as_Number: + case Descending|by_Value|as_Number: + if (errfmt == NULL) + errfmt = _("%s: sorting by value can't be forced to use numeric comparison"); + /* FALL THROUGH */ + /* gawk doesn't maintain the data necessary to sort by + order of insertion */ + case as_Inserted: + if (errfmt == NULL) + errfmt = _("%s: sort ordering \"as-inserted\" is not implemented"); + /* FALL THROUGH */ + default: + /* sort_func still has default value */ + if (errfmt == NULL) + errfmt = (allparts == unrecognized) + ? _("%s: sort specification is not recognized") + : _("%s: sort specification is invalid"); + if (warn_arg == NULL) { + /* asort()/asorti() bad sort specification argument */ + error(errfmt, default_by_value ? "asort" : "asorti"); + } else if (allparts != unrecognized) { + /* invalid combination of valid ordering components */ + if (allparts != prev_invalid) + warning(errfmt, warn_arg); + prev_invalid = allparts; + } else if (do_lint) { + /* unrecognized ordering specification string */ + if (! warned_unrecognized) + lintwarn(errfmt, warn_arg); + warned_unrecognized = TRUE; + } + break; + } + return sort_func; +} + +/* + * sorted_in --- fetch value of PROCINFO["sorted_in"] and use sort_selection() + * to parse it; controls whether and how traversal of ``for (index in array)'' + * should be sorted; returns a qsort comparison function + */ + +qsort_compfunc +sorted_in(void) /* called from r_interpret(eval.c) */ +{ + static NODE *sorted_str = NULL; + static short warned_extension = FALSE; + + NODE *r; + + /* if there's no PROCINFO[], there's no ["sorted_in"], so no sorting */ + if (PROCINFO_node == NULL) + return (qsort_compfunc) 0; + + if (sorted_str == NULL) /* do this once */ + sorted_str = make_string("sorted_in", 9); + + r = (NODE *) NULL; + if (in_array(PROCINFO_node, sorted_str)) + r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE); + /* if there's no PROCINFO["sorted_in"], there's no sorting */ + if (r == NULL) + return (qsort_compfunc) 0; + + /* we're going to make use of PROCINFO["sorted_in"] */ + if (do_lint && ! warned_extension) { + warned_extension = TRUE; + lintwarn(_("sorted array traversal is a gawk extension")); + } + + return sort_selection(r, "`PROCINFO[\"sorted_in\"]'", FALSE); +} + + /* From bonzini@gnu.org Mon Oct 28 16:05:26 2002 Date: Mon, 28 Oct 2002 13:33:03 +0100 @@ -846,6 +846,9 @@ struct flagtab { #endif #define UNLIMITED LONG_MAX +/* qsort comparison function */ +typedef int (*qsort_compfunc)(const void *,const void *); + /* -------------------------- External variables -------------------------- */ /* gawk builtin variables */ extern long NF; @@ -1123,6 +1126,9 @@ extern NODE *assoc_dump(NODE *symbol, int indent_level); extern NODE *do_adump(int nargs); extern NODE *do_asort(int nargs); extern NODE *do_asorti(int nargs); +extern int comp_func(const void *p1, const void *p2); +extern qsort_compfunc sorted_in(void); +extern void sort_maybe_numeric_index(qsort_compfunc, NODE **, size_t, int); extern unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code); /* awkgram.c */ extern NODE *mk_symbol(NODETYPE type, NODE *value); @@ -1205,7 +1211,6 @@ extern void PUSH_CODE(INSTRUCTION *cp); extern INSTRUCTION *POP_CODE(void); extern int interpret(INSTRUCTION *); extern int cmp_nodes(NODE *, NODE *); -extern int comp_func(const void *p1, const void *p2); extern void set_IGNORECASE(void); extern void set_OFS(void); extern void set_ORS(void); diff --git a/doc/ChangeLog b/doc/ChangeLog index 9c2ad6ec..2f25cb6a 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,13 @@ +Sun Mar 27 21:10:55 2011 Pat Rankin <rankin@pactechdata.com> + + * gawk.texi (Builit-in Variables: PROCINFO array, Scanning All + Elements of an Array: `for' statement): Update the documentation + for PROCINFO["sorted_in"]; add "ascending index number", + "descending index string", "ascending value", and "descending + value" as supported sort orderings. + * gawk.1 (PROCINFO array): Update PROCINFO["sorted_in"] to + reflect that the value matters, and list the supported sort orders. + Tue Feb 15 17:11:26 2011 Pat Rankin <rankin@pactechdata.com> * gawk.texi (Builit-in Variables: PROCINFO array, Scanning All @@ -1082,13 +1082,20 @@ system call. \fBPROCINFO["sorted_in"]\fP If this element exists in .BR PROCINFO , -.IR "no matter what its value" , -then -.I gawk -will cause array +then its value controls the order in which array elements +are traversed in .B for -loops -to traverse the array indices in sorted order. +loops. +Supported values are +\fB"ascending index string"\fR, +\fB"ascending index number"\fR, +\fB"ascending value"\fR, +\fB"descending index string"\fR, +\fB"descending index number"\fR, +\fB"descending value"\fR, and +\fB"unsorted"\fR. +The order specification words can be truncated, or omitted (provided +that at least one is present), or given in any order. .TP \fBPROCINFO["version"]\fP the version of diff --git a/doc/gawk.info b/doc/gawk.info index 714df826..f084fa3f 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -3633,7 +3633,7 @@ better performance when reading records. Otherwise, `gawk' has to make several function calls, _per input character_, to find the record terminator. - According to POSIX, string conmparison is also affected by locales + According to POSIX, string comparison is also affected by locales (similar to regular expressions). The details are presented in *note POSIX String Comparison::. @@ -9408,18 +9408,32 @@ with a pound sign (`#'). `PROCINFO["sorted_in"]' If this element exists in `PROCINFO', its value controls the - order in which array indices will be processed by `for(i in - arr) ...' loops. A value of `"ascending index string"', - which may be shortened to `"ascending index"' or just - `"ascending"', will result in either case sensitive or case - insensitive ascending order depending upon the value of - `IGNORECASE'. A value of `"descending index string"', which - may be shortened in a similar manner, will result in the - opposite order. The value `"unsorted"' is also recognized, - yielding the default result of arbitrary order. Any other - value will be ignored, and warned about (at the time of first - `for(in in arr) ...' execution) when lint checking is enabled. - *Note Scanning an Array::, for more information. + order in which array indices will be processed by `for (index + in array) ...' loops. The value should contain one to three + words; separate pairs of words by a single space. One word + controls sort direction, "ascending" or "descending;" another + controls the sort key, "index" or "value;" and the remaining + one, which is only valid for sorting by index, is comparison + mode, "string" or "number." When two or three words are + present, they may be specified in any order, so `ascending + index string' and `string ascending index' are equivalent. + Also, each word may be truncated, so `asc index str' and `a i + s' are also equivalent. Note that a separating space is + required even when the words have been shortened down to one + letter each. + + You can omit direction and/or key type and/or comparison + mode. Provided that at least one is present, missing parts + of a sort specification default to `ascending', `index', and + (for indices only) `string', respectively. An empty string, + `""', is the same as `unsorted' and will cause `for (index in + array) ...' to process the indices in arbitrary order. + Another thing to note is that the array sorting takes place + at the time `for (... in ...)' is about to start executing, + so changing the value of `PROCINFO["sorted_in"]' during loop + execution does not have any effect on the order in which any + remaining array elements get processed. *Note Scanning an + Array::, for more information. `PROCINFO["strftime"]' The default time format string for `strftime()'. Assigning a @@ -9938,15 +9952,43 @@ produce strange results. It is best to avoid such things. As an extension, `gawk' makes it possible for you to loop over the elements of an array in order, based on the value of -`PROCINFO["sorted_in"]' (*note Auto-set::). At present two sorting -options are available: `"ascending index string"' and `"descending -index string"'. They can be shortened by omitting `string' or `index -string'. The value `"unsorted"' can be used as an explicit "no-op" and -yields the same result as when `PROCINFO["sorted_in"]' has no value at -all. If the index strings contain letters, the value of `IGNORECASE' -affects the order of the result. This extension is disabled in POSIX -mode, since the `PROCINFO' array is not special in that case. For -example: +`PROCINFO["sorted_in"]' (*note Auto-set::). Several sorting options +are available: + +`"ascending index string"' + Order by indices compared as strings, the most basic sort. + (Internally, array indices are always strings, so with `a[2*5] = 1' + the index is actually `"10"' rather than numeric 10.) + +`"ascending index number"' + Order by indices but force them to be treated as numbers in the + process. Any index with non-numeric value will end up positioned + as if it were 0. + +`"ascending value"' + Order by element values rather than by indices. Comparisons are + done as numeric when both values being compared are numeric, or + done as strings when either or both aren't numeric. Sub-arrays, + if present, come out last. + +`"descending index string"' + Reverse order from the most basic sort. + +`"descending index number"' + Numeric indices ordered from high to low. + +`"descending value"' + Element values ordered from high to low. Sub-arrays, if present, + come out first. + +`"unsorted"' + Array elements are processed in arbitrary order, the normal `awk' + behavior. + + Portions of the sort specification string may be truncated or +omitted. The default is `ascending' for direction, `index' for sort +key type, and (when sorting by index only) `string' for comparison mode. +For example: $ gawk 'BEGIN { > a[4] = 4 @@ -9957,7 +9999,7 @@ example: -| 4 4 -| 3 3 $ gawk 'BEGIN { - > PROCINFO["sorted_in"] = "ascending index" + > PROCINFO["sorted_in"] = "asc index" > a[4] = 4 > a[3] = 3 > for (i in a) @@ -9971,6 +10013,26 @@ array has been reported to add 15% to 20% overhead to the execution time of `awk' programs. For this reason, sorted array traversal is not the default. + When sorting an array by element values, if a value happens to be a +sub-array then it is considered to be greater than any string or +numeric value, regardless of what the sub-array itself contains, and +all sub-arrays are treated as being equal to each other. Their order +relative to each other is determined by their index strings. + + Sorting by array element values (for values other than sub-arrays) +always uses basic `awk' comparison mode: if both values happen to be +numbers then they're compared as numbers, otherwise they're compared as +strings. + + When string comparisons are made during a sort, either for element +values where one or both aren't numbers or for element indices handled +as strings, the value of `IGNORECASE' controls whether the comparisons +treat corresponding upper and lower case letters as equivalent or +distinct. + + This sorting extension is disabled in POSIX mode, since the +`PROCINFO' array is not special in that case. + File: gawk.info, Node: Delete, Next: Numeric Array Subscripts, Prev: Array Basics, Up: Arrays @@ -24437,7 +24499,7 @@ Index (line 67) * advanced features, data files as single record: Records. (line 175) * advanced features, fixed-width data: Constant Size. (line 9) -* advanced features, FNR/NR variables: Auto-set. (line 215) +* advanced features, FNR/NR variables: Auto-set. (line 229) * advanced features, gawk: Advanced Features. (line 6) * advanced features, gawk, network programming: TCP/IP Networking. (line 6) @@ -24937,7 +24999,7 @@ Index (line 47) * dark corner, FILENAME variable <1>: Auto-set. (line 92) * dark corner, FILENAME variable: Getline Notes. (line 19) -* dark corner, FNR/NR variables: Auto-set. (line 215) +* dark corner, FNR/NR variables: Auto-set. (line 229) * dark corner, format-control characters: Control Letters. (line 18) * dark corner, FS as null string: Single Character Fields. (line 20) @@ -25136,7 +25198,7 @@ Index * differences in awk and gawk, regular expressions: Case-sensitivity. (line 26) * differences in awk and gawk, RS/RT variables: Records. (line 167) -* differences in awk and gawk, RT variable: Auto-set. (line 204) +* differences in awk and gawk, RT variable: Auto-set. (line 218) * differences in awk and gawk, single-character fields: Single Character Fields. (line 6) * differences in awk and gawk, split() function: String Functions. @@ -25417,7 +25479,7 @@ Index * floating-point, numbers, AWKNUM internal type: Internals. (line 19) * FNR variable <1>: Auto-set. (line 102) * FNR variable: Records. (line 6) -* FNR variable, changing: Auto-set. (line 215) +* FNR variable, changing: Auto-set. (line 229) * for statement: For Statement. (line 6) * for statement, in arrays: Scanning an Array. (line 20) * force_number() internal function: Internals. (line 27) @@ -25591,7 +25653,7 @@ Index * gawk, regular expressions, operators: GNU Regexp Operators. (line 6) * gawk, regular expressions, precedence: Regexp Operators. (line 157) -* gawk, RT variable in <1>: Auto-set. (line 204) +* gawk, RT variable in <1>: Auto-set. (line 218) * gawk, RT variable in <2>: Getline/Variable/File. (line 10) * gawk, RT variable in <3>: Multiple Line. (line 129) @@ -26038,7 +26100,7 @@ Index * not Boolean-logic operator: Boolean Ops. (line 6) * NR variable <1>: Auto-set. (line 118) * NR variable: Records. (line 6) -* NR variable, changing: Auto-set. (line 215) +* NR variable, changing: Auto-set. (line 229) * null strings <1>: Basic Data Typing. (line 50) * null strings <2>: Truth Values. (line 6) * null strings <3>: Regexp Field Splitting. @@ -26470,7 +26532,7 @@ Index * right angle bracket (>), >> operator (I/O): Redirection. (line 50) * right shift, bitwise: Bitwise Functions. (line 32) * Ritchie, Dennis: Basic Data Typing. (line 74) -* RLENGTH variable: Auto-set. (line 191) +* RLENGTH variable: Auto-set. (line 205) * RLENGTH variable, match() function and: String Functions. (line 205) * Robbins, Arnold <1>: Future Extensions. (line 6) * Robbins, Arnold <2>: Bugs. (line 32) @@ -26495,9 +26557,9 @@ Index * RS variable: Records. (line 20) * RS variable, multiline records and: Multiple Line. (line 17) * rshift() function (gawk): Bitwise Functions. (line 51) -* RSTART variable: Auto-set. (line 197) +* RSTART variable: Auto-set. (line 211) * RSTART variable, match() function and: String Functions. (line 205) -* RT variable <1>: Auto-set. (line 204) +* RT variable <1>: Auto-set. (line 218) * RT variable <2>: Getline/Variable/File. (line 10) * RT variable <3>: Multiple Line. (line 129) @@ -27008,339 +27070,339 @@ Ref: Case-sensitivity-Footnote-2154459 Node: Leftmost Longest154567 Node: Computed Regexps155768 Node: Locales159194 -Node: Reading Files162902 -Node: Records164843 -Ref: Records-Footnote-1173517 -Node: Fields173554 -Ref: Fields-Footnote-1176587 -Node: Nonconstant Fields176673 -Node: Changing Fields178875 -Node: Field Separators184853 -Node: Default Field Splitting187482 -Node: Regexp Field Splitting188599 -Node: Single Character Fields191941 -Node: Command Line Field Separator193000 -Node: Field Splitting Summary196441 -Ref: Field Splitting Summary-Footnote-1199633 -Node: Constant Size199734 -Node: Splitting By Content204318 -Ref: Splitting By Content-Footnote-1208044 -Node: Multiple Line208084 -Ref: Multiple Line-Footnote-1213931 -Node: Getline214110 -Node: Plain Getline216338 -Node: Getline/Variable218427 -Node: Getline/File219568 -Node: Getline/Variable/File220890 -Ref: Getline/Variable/File-Footnote-1222489 -Node: Getline/Pipe222576 -Node: Getline/Variable/Pipe225136 -Node: Getline/Coprocess226243 -Node: Getline/Variable/Coprocess227486 -Node: Getline Notes228200 -Node: Getline Summary230142 -Ref: table-getline-variants230485 -Node: Command line directories231341 -Node: Printing231966 -Node: Print233597 -Node: Print Examples234934 -Node: Output Separators237718 -Node: OFMT239478 -Node: Printf240836 -Node: Basic Printf241742 -Node: Control Letters243281 -Node: Format Modifiers247093 -Node: Printf Examples253102 -Node: Redirection255817 -Node: Special Files262801 -Node: Special FD263334 -Ref: Special FD-Footnote-1266958 -Node: Special Network267032 -Node: Special Caveats267882 -Node: Close Files And Pipes268678 -Ref: Close Files And Pipes-Footnote-1275701 -Ref: Close Files And Pipes-Footnote-2275849 -Node: Expressions275999 -Node: Values277068 -Node: Constants277744 -Node: Scalar Constants278424 -Ref: Scalar Constants-Footnote-1279283 -Node: Nondecimal-numbers279465 -Node: Regexp Constants282524 -Node: Using Constant Regexps282999 -Node: Variables286054 -Node: Using Variables286709 -Node: Assignment Options288433 -Node: Conversion290305 -Ref: table-locale-affects295681 -Ref: Conversion-Footnote-1296305 -Node: All Operators296414 -Node: Arithmetic Ops297044 -Node: Concatenation299549 -Ref: Concatenation-Footnote-1302342 -Node: Assignment Ops302462 -Ref: table-assign-ops307450 -Node: Increment Ops308858 -Node: Truth Values and Conditions312328 -Node: Truth Values313411 -Node: Typing and Comparison314460 -Node: Variable Typing315249 -Ref: Variable Typing-Footnote-1319146 -Node: Comparison Operators319268 -Ref: table-relational-ops319678 -Node: POSIX String Comparison323227 -Ref: POSIX String Comparison-Footnote-1324183 -Node: Boolean Ops324321 -Ref: Boolean Ops-Footnote-1328399 -Node: Conditional Exp328490 -Node: Function Calls330222 -Node: Precedence333816 -Node: Patterns and Actions337469 -Node: Pattern Overview338523 -Node: Regexp Patterns340189 -Node: Expression Patterns340732 -Node: Ranges344306 -Node: BEGIN/END347272 -Node: Using BEGIN/END348034 -Ref: Using BEGIN/END-Footnote-1350765 -Node: I/O And BEGIN/END350871 -Node: BEGINFILE/ENDFILE353153 -Node: Empty355984 -Node: Using Shell Variables356300 -Node: Action Overview358585 -Node: Statements360942 -Node: If Statement362796 -Node: While Statement364295 -Node: Do Statement366339 -Node: For Statement367495 -Node: Switch Statement370647 -Node: Break Statement372744 -Node: Continue Statement374734 -Node: Next Statement376521 -Node: Nextfile Statement378911 -Node: Exit Statement381387 -Node: Built-in Variables383803 -Node: User-modified384898 -Ref: User-modified-Footnote-1392924 -Node: Auto-set392986 -Ref: Auto-set-Footnote-1402861 -Node: ARGC and ARGV403066 -Node: Arrays406917 -Node: Array Basics408488 -Node: Array Intro409199 -Node: Reference to Elements413517 -Node: Assigning Elements415787 -Node: Array Example416278 -Node: Scanning an Array418010 -Node: Delete421542 -Ref: Delete-Footnote-1423977 -Node: Numeric Array Subscripts424034 -Node: Uninitialized Subscripts426217 -Node: Multi-dimensional427845 -Node: Multi-scanning430936 -Node: Array Sorting432520 -Ref: Array Sorting-Footnote-1435614 -Node: Arrays of Arrays435808 -Node: Functions440381 -Node: Built-in441203 -Node: Calling Built-in442281 -Node: Numeric Functions444269 -Ref: Numeric Functions-Footnote-1448034 -Ref: Numeric Functions-Footnote-2448391 -Ref: Numeric Functions-Footnote-3448439 -Node: String Functions448708 -Ref: String Functions-Footnote-1471210 -Ref: String Functions-Footnote-2471339 -Ref: String Functions-Footnote-3471587 -Node: Gory Details471674 -Ref: table-sub-escapes473353 -Ref: table-posix-sub474667 -Ref: table-gensub-escapes475580 -Node: I/O Functions476751 -Ref: I/O Functions-Footnote-1483406 -Node: Time Functions483553 -Ref: Time Functions-Footnote-1494448 -Ref: Time Functions-Footnote-2494516 -Ref: Time Functions-Footnote-3494674 -Ref: Time Functions-Footnote-4494785 -Ref: Time Functions-Footnote-5494897 -Ref: Time Functions-Footnote-6495124 -Node: Bitwise Functions495390 -Ref: table-bitwise-ops495948 -Ref: Bitwise Functions-Footnote-1500108 -Node: Type Functions500292 -Node: I18N Functions500762 -Node: User-defined502389 -Node: Definition Syntax503193 -Ref: Definition Syntax-Footnote-1508103 -Node: Function Example508172 -Node: Function Caveats510766 -Node: Calling A Function511187 -Node: Variable Scope512302 -Node: Pass By Value/Reference514277 -Node: Return Statement517717 -Node: Dynamic Typing520698 -Node: Indirect Calls521433 -Node: Internationalization531118 -Node: I18N and L10N532544 -Node: Explaining gettext533230 -Ref: Explaining gettext-Footnote-1538296 -Ref: Explaining gettext-Footnote-2538480 -Node: Programmer i18n538645 -Node: Translator i18n542845 -Node: String Extraction543638 -Ref: String Extraction-Footnote-1544599 -Node: Printf Ordering544685 -Ref: Printf Ordering-Footnote-1547469 -Node: I18N Portability547533 -Ref: I18N Portability-Footnote-1549982 -Node: I18N Example550045 -Ref: I18N Example-Footnote-1552680 -Node: Gawk I18N552752 -Node: Advanced Features553369 -Node: Nondecimal Data554688 -Node: Two-way I/O556269 -Ref: Two-way I/O-Footnote-1561703 -Node: TCP/IP Networking561773 -Node: Profiling564617 -Node: Library Functions572091 -Ref: Library Functions-Footnote-1575196 -Node: Library Names575367 -Ref: Library Names-Footnote-1578838 -Ref: Library Names-Footnote-2579058 -Node: General Functions579144 -Node: Nextfile Function580207 -Node: Strtonum Function584588 -Node: Assert Function587544 -Node: Round Function590870 -Node: Cliff Random Function592413 -Node: Ordinal Functions593429 -Ref: Ordinal Functions-Footnote-1596499 -Ref: Ordinal Functions-Footnote-2596751 -Node: Join Function596960 -Ref: Join Function-Footnote-1598731 -Node: Gettimeofday Function598931 -Node: Data File Management602646 -Node: Filetrans Function603278 -Node: Rewind Function607514 -Node: File Checking608967 -Node: Empty Files610061 -Node: Ignoring Assigns612291 -Node: Getopt Function613844 -Ref: Getopt Function-Footnote-1625148 -Node: Passwd Functions625351 -Ref: Passwd Functions-Footnote-1634326 -Node: Group Functions634414 -Node: Walking Arrays642498 -Node: Sample Programs644067 -Node: Running Examples644732 -Node: Clones645460 -Node: Cut Program646684 -Node: Egrep Program656533 -Ref: Egrep Program-Footnote-1664304 -Node: Id Program664414 -Node: Split Program668030 -Ref: Split Program-Footnote-1671549 -Node: Tee Program671677 -Node: Uniq Program674480 -Node: Wc Program681903 -Ref: Wc Program-Footnote-1686167 -Node: Miscellaneous Programs686367 -Node: Dupword Program687555 -Node: Alarm Program689586 -Node: Translate Program694343 -Ref: Translate Program-Footnote-1698722 -Ref: Translate Program-Footnote-2698950 -Node: Labels Program699084 -Ref: Labels Program-Footnote-1702455 -Node: Word Sorting702539 -Node: History Sorting706422 -Node: Extract Program708260 -Ref: Extract Program-Footnote-1715741 -Node: Simple Sed715869 -Node: Igawk Program718931 -Ref: Igawk Program-Footnote-1733963 -Ref: Igawk Program-Footnote-2734164 -Node: Anagram Program734302 -Node: Signature Program737400 -Node: Debugger738503 -Node: Debugging739414 -Node: Debugging Concepts739728 -Node: Debugging Terms741584 -Node: Awk Debugging744129 -Node: Sample dgawk session745021 -Node: dgawk invocation745513 -Node: Finding The Bug746695 -Node: List of Debugger Commands753180 -Node: Breakpoint Control754491 -Node: Dgawk Execution Control757967 -Node: Viewing And Changing Data761318 -Node: Dgawk Stack764627 -Node: Dgawk Info766087 -Node: Miscellaneous Dgawk Commands770035 -Node: Readline Support775466 -Node: Dgawk Limitations776293 -Node: Language History778432 -Node: V7/SVR3.1779864 -Node: SVR4782159 -Node: POSIX783601 -Node: BTL784599 -Node: POSIX/GNU785333 -Node: Common Extensions790519 -Node: Contributors791620 -Node: Installation795655 -Node: Gawk Distribution796549 -Node: Getting797033 -Node: Extracting797859 -Node: Distribution contents799550 -Node: Unix Installation804568 -Node: Quick Installation805185 -Node: Additional Configuration Options807147 -Node: Configuration Philosophy808624 -Node: Non-Unix Installation810966 -Node: PC Installation811424 -Node: PC Binary Installation812723 -Node: PC Compiling814571 -Node: PC Testing817515 -Node: PC Using818691 -Node: Cygwin822876 -Node: MSYS823873 -Node: VMS Installation824387 -Node: VMS Compilation824993 -Ref: VMS Compilation-Footnote-1826000 -Node: VMS Installation Details826058 -Node: VMS Running827693 -Node: VMS Old Gawk829300 -Node: Bugs829774 -Node: Other Versions833639 -Node: Notes838918 -Node: Compatibility Mode839610 -Node: Additions840393 -Node: Accessing The Source841205 -Node: Adding Code842628 -Node: New Ports848176 -Node: Dynamic Extensions852289 -Node: Internals853665 -Node: Plugin License862781 -Node: Sample Library863415 -Node: Internal File Description864101 -Node: Internal File Ops867808 -Ref: Internal File Ops-Footnote-1872576 -Node: Using Internal File Ops872724 -Node: Future Extensions875101 -Node: Basic Concepts877605 -Node: Basic High Level878362 -Ref: Basic High Level-Footnote-1882397 -Node: Basic Data Typing882582 -Node: Floating Point Issues887107 -Node: String Conversion Precision888190 -Ref: String Conversion Precision-Footnote-1889884 -Node: Unexpected Results889993 -Node: POSIX Floating Point Problems891819 -Ref: POSIX Floating Point Problems-Footnote-1895521 -Node: Glossary895559 -Node: Copying919702 -Node: GNU Free Documentation License957259 -Node: Index982396 +Node: Reading Files162901 +Node: Records164842 +Ref: Records-Footnote-1173516 +Node: Fields173553 +Ref: Fields-Footnote-1176586 +Node: Nonconstant Fields176672 +Node: Changing Fields178874 +Node: Field Separators184852 +Node: Default Field Splitting187481 +Node: Regexp Field Splitting188598 +Node: Single Character Fields191940 +Node: Command Line Field Separator192999 +Node: Field Splitting Summary196440 +Ref: Field Splitting Summary-Footnote-1199632 +Node: Constant Size199733 +Node: Splitting By Content204317 +Ref: Splitting By Content-Footnote-1208043 +Node: Multiple Line208083 +Ref: Multiple Line-Footnote-1213930 +Node: Getline214109 +Node: Plain Getline216337 +Node: Getline/Variable218426 +Node: Getline/File219567 +Node: Getline/Variable/File220889 +Ref: Getline/Variable/File-Footnote-1222488 +Node: Getline/Pipe222575 +Node: Getline/Variable/Pipe225135 +Node: Getline/Coprocess226242 +Node: Getline/Variable/Coprocess227485 +Node: Getline Notes228199 +Node: Getline Summary230141 +Ref: table-getline-variants230484 +Node: Command line directories231340 +Node: Printing231965 +Node: Print233596 +Node: Print Examples234933 +Node: Output Separators237717 +Node: OFMT239477 +Node: Printf240835 +Node: Basic Printf241741 +Node: Control Letters243280 +Node: Format Modifiers247092 +Node: Printf Examples253101 +Node: Redirection255816 +Node: Special Files262800 +Node: Special FD263333 +Ref: Special FD-Footnote-1266957 +Node: Special Network267031 +Node: Special Caveats267881 +Node: Close Files And Pipes268677 +Ref: Close Files And Pipes-Footnote-1275700 +Ref: Close Files And Pipes-Footnote-2275848 +Node: Expressions275998 +Node: Values277067 +Node: Constants277743 +Node: Scalar Constants278423 +Ref: Scalar Constants-Footnote-1279282 +Node: Nondecimal-numbers279464 +Node: Regexp Constants282523 +Node: Using Constant Regexps282998 +Node: Variables286053 +Node: Using Variables286708 +Node: Assignment Options288432 +Node: Conversion290304 +Ref: table-locale-affects295680 +Ref: Conversion-Footnote-1296304 +Node: All Operators296413 +Node: Arithmetic Ops297043 +Node: Concatenation299548 +Ref: Concatenation-Footnote-1302341 +Node: Assignment Ops302461 +Ref: table-assign-ops307449 +Node: Increment Ops308857 +Node: Truth Values and Conditions312327 +Node: Truth Values313410 +Node: Typing and Comparison314459 +Node: Variable Typing315248 +Ref: Variable Typing-Footnote-1319145 +Node: Comparison Operators319267 +Ref: table-relational-ops319677 +Node: POSIX String Comparison323226 +Ref: POSIX String Comparison-Footnote-1324182 +Node: Boolean Ops324320 +Ref: Boolean Ops-Footnote-1328398 +Node: Conditional Exp328489 +Node: Function Calls330221 +Node: Precedence333815 +Node: Patterns and Actions337468 +Node: Pattern Overview338522 +Node: Regexp Patterns340188 +Node: Expression Patterns340731 +Node: Ranges344305 +Node: BEGIN/END347271 +Node: Using BEGIN/END348033 +Ref: Using BEGIN/END-Footnote-1350764 +Node: I/O And BEGIN/END350870 +Node: BEGINFILE/ENDFILE353152 +Node: Empty355983 +Node: Using Shell Variables356299 +Node: Action Overview358584 +Node: Statements360941 +Node: If Statement362795 +Node: While Statement364294 +Node: Do Statement366338 +Node: For Statement367494 +Node: Switch Statement370646 +Node: Break Statement372743 +Node: Continue Statement374733 +Node: Next Statement376520 +Node: Nextfile Statement378910 +Node: Exit Statement381386 +Node: Built-in Variables383802 +Node: User-modified384897 +Ref: User-modified-Footnote-1392923 +Node: Auto-set392985 +Ref: Auto-set-Footnote-1403716 +Node: ARGC and ARGV403921 +Node: Arrays407772 +Node: Array Basics409343 +Node: Array Intro410054 +Node: Reference to Elements414372 +Node: Assigning Elements416642 +Node: Array Example417133 +Node: Scanning an Array418865 +Node: Delete424132 +Ref: Delete-Footnote-1426567 +Node: Numeric Array Subscripts426624 +Node: Uninitialized Subscripts428807 +Node: Multi-dimensional430435 +Node: Multi-scanning433526 +Node: Array Sorting435110 +Ref: Array Sorting-Footnote-1438204 +Node: Arrays of Arrays438398 +Node: Functions442971 +Node: Built-in443793 +Node: Calling Built-in444871 +Node: Numeric Functions446859 +Ref: Numeric Functions-Footnote-1450624 +Ref: Numeric Functions-Footnote-2450981 +Ref: Numeric Functions-Footnote-3451029 +Node: String Functions451298 +Ref: String Functions-Footnote-1473800 +Ref: String Functions-Footnote-2473929 +Ref: String Functions-Footnote-3474177 +Node: Gory Details474264 +Ref: table-sub-escapes475943 +Ref: table-posix-sub477257 +Ref: table-gensub-escapes478170 +Node: I/O Functions479341 +Ref: I/O Functions-Footnote-1485996 +Node: Time Functions486143 +Ref: Time Functions-Footnote-1497038 +Ref: Time Functions-Footnote-2497106 +Ref: Time Functions-Footnote-3497264 +Ref: Time Functions-Footnote-4497375 +Ref: Time Functions-Footnote-5497487 +Ref: Time Functions-Footnote-6497714 +Node: Bitwise Functions497980 +Ref: table-bitwise-ops498538 +Ref: Bitwise Functions-Footnote-1502698 +Node: Type Functions502882 +Node: I18N Functions503352 +Node: User-defined504979 +Node: Definition Syntax505783 +Ref: Definition Syntax-Footnote-1510693 +Node: Function Example510762 +Node: Function Caveats513356 +Node: Calling A Function513777 +Node: Variable Scope514892 +Node: Pass By Value/Reference516867 +Node: Return Statement520307 +Node: Dynamic Typing523288 +Node: Indirect Calls524023 +Node: Internationalization533708 +Node: I18N and L10N535134 +Node: Explaining gettext535820 +Ref: Explaining gettext-Footnote-1540886 +Ref: Explaining gettext-Footnote-2541070 +Node: Programmer i18n541235 +Node: Translator i18n545435 +Node: String Extraction546228 +Ref: String Extraction-Footnote-1547189 +Node: Printf Ordering547275 +Ref: Printf Ordering-Footnote-1550059 +Node: I18N Portability550123 +Ref: I18N Portability-Footnote-1552572 +Node: I18N Example552635 +Ref: I18N Example-Footnote-1555270 +Node: Gawk I18N555342 +Node: Advanced Features555959 +Node: Nondecimal Data557278 +Node: Two-way I/O558859 +Ref: Two-way I/O-Footnote-1564293 +Node: TCP/IP Networking564363 +Node: Profiling567207 +Node: Library Functions574681 +Ref: Library Functions-Footnote-1577786 +Node: Library Names577957 +Ref: Library Names-Footnote-1581428 +Ref: Library Names-Footnote-2581648 +Node: General Functions581734 +Node: Nextfile Function582797 +Node: Strtonum Function587178 +Node: Assert Function590134 +Node: Round Function593460 +Node: Cliff Random Function595003 +Node: Ordinal Functions596019 +Ref: Ordinal Functions-Footnote-1599089 +Ref: Ordinal Functions-Footnote-2599341 +Node: Join Function599550 +Ref: Join Function-Footnote-1601321 +Node: Gettimeofday Function601521 +Node: Data File Management605236 +Node: Filetrans Function605868 +Node: Rewind Function610104 +Node: File Checking611557 +Node: Empty Files612651 +Node: Ignoring Assigns614881 +Node: Getopt Function616434 +Ref: Getopt Function-Footnote-1627738 +Node: Passwd Functions627941 +Ref: Passwd Functions-Footnote-1636916 +Node: Group Functions637004 +Node: Walking Arrays645088 +Node: Sample Programs646657 +Node: Running Examples647322 +Node: Clones648050 +Node: Cut Program649274 +Node: Egrep Program659123 +Ref: Egrep Program-Footnote-1666894 +Node: Id Program667004 +Node: Split Program670620 +Ref: Split Program-Footnote-1674139 +Node: Tee Program674267 +Node: Uniq Program677070 +Node: Wc Program684493 +Ref: Wc Program-Footnote-1688757 +Node: Miscellaneous Programs688957 +Node: Dupword Program690145 +Node: Alarm Program692176 +Node: Translate Program696933 +Ref: Translate Program-Footnote-1701312 +Ref: Translate Program-Footnote-2701540 +Node: Labels Program701674 +Ref: Labels Program-Footnote-1705045 +Node: Word Sorting705129 +Node: History Sorting709012 +Node: Extract Program710850 +Ref: Extract Program-Footnote-1718331 +Node: Simple Sed718459 +Node: Igawk Program721521 +Ref: Igawk Program-Footnote-1736553 +Ref: Igawk Program-Footnote-2736754 +Node: Anagram Program736892 +Node: Signature Program739990 +Node: Debugger741093 +Node: Debugging742004 +Node: Debugging Concepts742318 +Node: Debugging Terms744174 +Node: Awk Debugging746719 +Node: Sample dgawk session747611 +Node: dgawk invocation748103 +Node: Finding The Bug749285 +Node: List of Debugger Commands755770 +Node: Breakpoint Control757081 +Node: Dgawk Execution Control760557 +Node: Viewing And Changing Data763908 +Node: Dgawk Stack767217 +Node: Dgawk Info768677 +Node: Miscellaneous Dgawk Commands772625 +Node: Readline Support778056 +Node: Dgawk Limitations778883 +Node: Language History781022 +Node: V7/SVR3.1782454 +Node: SVR4784749 +Node: POSIX786191 +Node: BTL787189 +Node: POSIX/GNU787923 +Node: Common Extensions793109 +Node: Contributors794210 +Node: Installation798245 +Node: Gawk Distribution799139 +Node: Getting799623 +Node: Extracting800449 +Node: Distribution contents802140 +Node: Unix Installation807158 +Node: Quick Installation807775 +Node: Additional Configuration Options809737 +Node: Configuration Philosophy811214 +Node: Non-Unix Installation813556 +Node: PC Installation814014 +Node: PC Binary Installation815313 +Node: PC Compiling817161 +Node: PC Testing820105 +Node: PC Using821281 +Node: Cygwin825466 +Node: MSYS826463 +Node: VMS Installation826977 +Node: VMS Compilation827583 +Ref: VMS Compilation-Footnote-1828590 +Node: VMS Installation Details828648 +Node: VMS Running830283 +Node: VMS Old Gawk831890 +Node: Bugs832364 +Node: Other Versions836229 +Node: Notes841508 +Node: Compatibility Mode842200 +Node: Additions842983 +Node: Accessing The Source843795 +Node: Adding Code845218 +Node: New Ports850766 +Node: Dynamic Extensions854879 +Node: Internals856255 +Node: Plugin License865371 +Node: Sample Library866005 +Node: Internal File Description866691 +Node: Internal File Ops870398 +Ref: Internal File Ops-Footnote-1875166 +Node: Using Internal File Ops875314 +Node: Future Extensions877691 +Node: Basic Concepts880195 +Node: Basic High Level880952 +Ref: Basic High Level-Footnote-1884987 +Node: Basic Data Typing885172 +Node: Floating Point Issues889697 +Node: String Conversion Precision890780 +Ref: String Conversion Precision-Footnote-1892474 +Node: Unexpected Results892583 +Node: POSIX Floating Point Problems894409 +Ref: POSIX Floating Point Problems-Footnote-1898111 +Node: Glossary898149 +Node: Copying922292 +Node: GNU Free Documentation License959849 +Node: Index984986 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index 7c63476f..0b410fc1 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -5138,7 +5138,7 @@ will give you much better performance when reading records. Otherwise, @command{gawk} has to make several function calls, @emph{per input character}, to find the record terminator. -According to POSIX, string conmparison is also affected by locales +According to POSIX, string comparison is also affected by locales (similar to regular expressions). The details are presented in @ref{POSIX String Comparison}. @@ -12756,17 +12756,30 @@ The parent process ID of the current process. @item PROCINFO["sorted_in"] If this element exists in @code{PROCINFO}, its value controls the order in which array indices will be processed by -@samp{for(i in arr) @dots{}} loops. -A value of @code{"ascending index string"}, which may be shortened to -@code{"ascending index"} or just @code{"ascending"}, will result in either -case sensitive or case insensitive ascending order depending upon -the value of @code{IGNORECASE}. -A value of @code{"descending index string"}, which may be shortened in -a similar manner, will result in the opposite order. -The value @code{"unsorted"} is also recognized, yielding the default -result of arbitrary order. Any other value will be ignored, and -warned about (at the time of first @samp{for(in in arr) @dots{}} -execution) when lint checking is enabled. +@samp{for (index in array) @dots{}} loops. +The value should contain one to three words; separate pairs of words +by a single space. +One word controls sort direction, ``ascending'' or ``descending;'' +another controls the sort key, ``index'' or ``value;'' and the remaining +one, which is only valid for sorting by index, is comparison mode, +``string'' or ``number.'' When two or three words are present, they may +be specified in any order, so @samp{ascending index string} and +@samp{string ascending index} are equivalent. Also, each word may +be truncated, so @samp{asc index str} and @samp{a i s} are also +equivalent. Note that a separating space is required even when the +words have been shortened down to one letter each. + +You can omit direction and/or key type and/or comparison mode. Provided +that at least one is present, missing parts of a sort specification +default to @samp{ascending}, @samp{index}, and (for indices only) @samp{string}, +respectively. +An empty string, @code{""}, is the same as @samp{unsorted} and will cause +@samp{for (index in array) @dots{}} to process the indices in +arbitrary order. Another thing to note is that the array sorting +takes place at the time @samp{for (@dots{} in @dots{})} is about to +start executing, so changing the value of @code{PROCINFO["sorted_in"]} +during loop execution does not have any effect on the order in which any +remaining array elements get processed. @xref{Scanning an Array}, for more information. @item PROCINFO["strftime"] @@ -13439,14 +13452,43 @@ strange results. It is best to avoid such things. As an extension, @command{gawk} makes it possible for you to loop over the elements of an array in order, based on the value of @code{PROCINFO["sorted_in"]} (@pxref{Auto-set}). -At present two sorting options are available: @code{"ascending -index string"} and @code{"descending index string"}. They can be -shortened by omitting @samp{string} or @samp{index string}. The value -@code{"unsorted"} can be used as an explicit ``no-op'' and yields the same -result as when @code{PROCINFO["sorted_in"]} has no value at all. If the -index strings contain letters, the value of @code{IGNORECASE} affects -the order of the result. This extension is disabled in POSIX mode, -since the @code{PROCINFO} array is not special in that case. For example: +Several sorting options are available: + +@table @code +@item "ascending index string" +Order by indices compared as strings, the most basic sort. +(Internally, array indices are always strings, so with @code{a[2*5] = 1} +the index is actually @code{"10"} rather than numeric 10.) + +@item "ascending index number" +Order by indices but force them to be treated as numbers in the process. +Any index with non-numeric value will end up positioned as if it were 0. + +@item "ascending value" +Order by element values rather than by indices. Comparisons are done +as numeric when both values being compared are numeric, or done as +strings when either or both aren't numeric. Sub-arrays, if present, +come out last. + +@item "descending index string" +Reverse order from the most basic sort. + +@item "descending index number" +Numeric indices ordered from high to low. + +@item "descending value" +Element values ordered from high to low. Sub-arrays, if present, +come out first. + +@item "unsorted" +Array elements are processed in arbitrary order, the normal @command{awk} +behavior. +@end table + +Portions of the sort specification string may be truncated or omitted. +The default is @samp{ascending} for direction, @samp{index} for sort key type, +and (when sorting by index only) @samp{string} for comparison mode. +For example: @example $ @kbd{gawk 'BEGIN @{} @@ -13458,7 +13500,7 @@ $ @kbd{gawk 'BEGIN @{} @print{} 4 4 @print{} 3 3 $ @kbd{gawk 'BEGIN @{} -> @kbd{ PROCINFO["sorted_in"] = "ascending index"} +> @kbd{ PROCINFO["sorted_in"] = "asc index"} > @kbd{ a[4] = 4} > @kbd{ a[3] = 3} > @kbd{ for (i in a)} @@ -13476,6 +13518,26 @@ sorted array traversal is not the default. @c maintainers believe that only the people who wish to use a @c feature should have to pay for it. +When sorting an array by element values, if a value happens to be +a sub-array then it is considered to be greater than any string or +numeric value, regardless of what the sub-array itself contains, +and all sub-arrays are treated as being equal to each other. Their +order relative to each other is determined by their index strings. + +Sorting by array element values (for values other than sub-arrays) +always uses basic @command{awk} comparison mode: if both values +happen to be numbers then they're compared as numbers, otherwise +they're compared as strings. + +When string comparisons are made during a sort, either for element +values where one or both aren't numbers or for element indices +handled as strings, the value of @code{IGNORECASE} controls whether +the comparisons treat corresponding upper and lower case letters as +equivalent or distinct. + +This sorting extension is disabled in POSIX mode, +since the @code{PROCINFO} array is not special in that case. + @node Delete @section The @code{delete} Statement @cindex @code{delete} statement @@ -1055,254 +1055,6 @@ update_FNR() } -typedef int (*qsort_compfunc)(const void *,const void *); - -static qsort_compfunc sorted_in(void); -static int sort_ignrcase(const char *, size_t,const char *,size_t); -static int sort_up_index_str(const void *, const void *); -static int sort_down_index_str(const void *, const void *); -static int sort_up_index_ignrcase(const void *, const void *); -static int sort_down_index_ignrcase(const void *, const void *); - -/* comp_func --- array index comparison function for qsort, used in debug.c */ - -int -comp_func(const void *p1, const void *p2) -{ - return sort_up_index_str(p1, p2); -} - -/* sort_ignorecase --- case insensitive string comparison from cmp_nodes() */ - -static int -sort_ignorecase(const char *s1, size_t l1, const char *s2, size_t l2) -{ - size_t l; - int ret; - - l = (l1 < l2) ? l1 : l2; -#ifdef MBS_SUPPORT - if (gawk_mb_cur_max > 1) { - ret = strncasecmpmbs((const unsigned char *) s1, - (const unsigned char *) s2, l); - } else -#endif - for (ret = 0; l-- > 0 && ret == 0; s1++, s2++) - ret = casetable[*(unsigned char *) s1] - - casetable[*(unsigned char *) s2]; - if (ret == 0 && l1 != l2) - ret = (l1 < l2) ? -1 : 1; - return ret; -} - -/* - * sort_up_index_str --- qsort comparison function; ascending index strings; - * index strings are distinct within an array, so no comparisons will yield - * equal and warrant disambiguation - */ - -static int -sort_up_index_str(const void *p1, const void *p2) -{ - const NODE *t1, *t2; - const char *str1, *str2; - size_t len1, len2; - int ret; - - /* Array indexes are strings and distinct, never equal */ - t1 = *((const NODE *const *) p1); - t2 = *((const NODE *const *) p2); - - str1 = t1->ahname_str; - len1 = t1->ahname_len; - str2 = t2->ahname_str; - len2 = t2->ahname_len; - - ret = memcmp(str1, str2, (len1 < len2) ? len1 : len2); - /* - * if they compared equal but their lengths differ, the - * shorter one is a prefix of the longer and compares as less - */ - if (ret == 0 && len1 != len2) - ret = (len1 < len2) ? -1 : 1; - - /* indices are unique within an array, so result should never be 0 */ - assert(ret != 0); - return ret; -} - -/* sort_down_index_str --- descending index strings */ - -static int -sort_down_index_str(const void *p1, const void *p2) -{ - /* - * Negation versus transposed arguments: when all keys are - * distinct, as with array indices here, either method will - * transform an ascending sort into a descending one. But if - * there are equal keys--such as when IGNORECASE is honored-- - * that get disambiguated into a determisitc order, negation - * will reverse those but transposed arguments would retain - * their relative order within the rest of the reversed sort. - */ - return -sort_up_index_str(p1, p2); -} - -/* - * sort_up_index_ignrcase --- ascending index string, case insensitive; - * case insensitive comparison can cause distinct index strings to compare - * equal, so disambiguation in necessary - */ - -static int -sort_up_index_ignrcase(const void *p1, const void *p2) -{ - const NODE *t1, *t2; - int ret; - - t1 = *((const NODE *const *) p1); - t2 = *((const NODE *const *) p2); - - ret = sort_ignorecase(t1->ahname_str, t1->ahname_len, - t2->ahname_str, t2->ahname_len); - - /* - * if case insensitive result is "they're the same", - * use case sensitive comparison to force distinct order - */ - if (ret == 0) - ret = sort_up_index_str(p1, p2); - return ret; -} - -/* sort_down_index_ignrcase --- descending index strings, case insensitive */ - -static int -sort_down_index_ignrcase(const void *p1, const void *p2) -{ - return -sort_up_index_ignrcase(p1, p2); -} - -struct sort_option { - qsort_compfunc func; - const char *keydescr; -}; - -/* - * sorted_in --- fetch and parse value of PROCINFO["sorted_in"] to decide - * whether to sort the traversal of ``for (index in array) {}'', and - * if so, what ordering to generate; returns a qsort comparison function - */ - -static qsort_compfunc -sorted_in(void) -{ - static struct sort_option sorts[] = { - /* explicit no-op */ - { (qsort_compfunc) NULL, "unsorted" }, - /* also matches "ascending index" and "ascending" */ - { sort_up_index_str, "ascending index string" }, - /* also matches "descending index" and "descending" */ - { sort_down_index_str, "descending index string" }, - /* - * to come - */ - /* also matches "ascending value"; "ascending" won't get here */ - /* { sort_up_value_str, "ascending value string" }, - * { sort_down_value_str, "descending value string" }, - * { sort_up_index_num, "ascending index number" }, - * { sort_down_index_num, "descending index number" }, - * { sort_up_value_num, "ascending value number" }, - * { sort_down_value_num, "descending value number" }, - * and possibly - * { sort_as_inserted, "insertion-order" }, - */ - }; - static NODE *sorted_str = NULL; - static short warned_extension = FALSE, warned_unrecognized = FALSE; - - NODE *r; - const char *s, *descr; - size_t i, k, l; - short is_number; - qsort_compfunc sort_func; - - /* if there's no PROCINFO[], there can be no ["sorted_in"], so no sorting */ - if (PROCINFO_node == NULL) - return (qsort_compfunc) NULL; - - if (sorted_str == NULL) /* do this once */ - sorted_str = make_string("sorted_in", 9); - - r = (NODE *) NULL; - if (in_array(PROCINFO_node, sorted_str)) - r = *assoc_lookup(PROCINFO_node, sorted_str, TRUE); - /* if there's no PROCINFO["sorted_in"], there's no sorting */ - if (!r) - return (qsort_compfunc) 0; - - /* we're going to make use of PROCINFO["sorted_in"] */ - if (do_lint && ! warned_extension) { - warned_extension = TRUE; - lintwarn(_("sorted array traversal is a gawk extension")); - } - - /* default result is no sorting */ - sort_func = (qsort_compfunc) NULL; - - /* undocumented synonyms: 0 is "unsorted", 1 is "ascending index" */ - if (r->flags & MAYBE_NUM) - (void) force_number(r); - is_number = ((r->flags & NUMBER) != 0); - if (is_number) { - if (r->numbr == 1) - sort_func = sort_up_index_str; - if (r->numbr == 0 || r->numbr == 1) - goto got_func; - /* - * using PROCINFO["sorted_in"] as a number is not a general - * index into sorts[]; entries beyond [1] may get reordered - */ - } - - r = force_string(r); - s = r->stptr; - l = r->stlen; - while (l > 0 && *s == ' ') - ++s, --l; - - /* treat empty string the same as 0, a synonym for no sorting */ - if (l == 0) - goto got_func; /* sort_func is still 0 */ - - /* scan the list of available sorting orders */ - for (i = 0; i < (sizeof sorts / sizeof *sorts); ++i) { - descr = sorts[i].keydescr; - if (((k = strlen(descr)) == l || (k > l && descr[l] == ' ')) - && !strncasecmp(s, descr, l)) { - sort_func = sorts[i].func; - goto got_func; - } - } - - /* we didn't match any key description; sort_func is still 0 */ - if ((do_lint || is_number) && ! warned_unrecognized) { - warned_unrecognized = TRUE; - lintwarn(_("`PROCINFO[\"sorted_in\"]' value is not recognized")); - } - -got_func: - if (IGNORECASE) { - if (sort_func == sort_up_index_str) - sort_func = sort_up_index_ignrcase; - if (sort_func == sort_down_index_str) - sort_func = sort_down_index_ignrcase; - } - - return sort_func; -} - - NODE *frame_ptr; /* current frame */ STACK_ITEM *stack_ptr = NULL; @@ -2444,9 +2196,19 @@ post: } sort_compare = sorted_in(); - if (sort_compare) + if (sort_compare) { + /* ordering might be "* index number" + in which case we want more than just + a simple array of element nodes */ + sort_maybe_numeric_index(sort_compare, list, + num_elems, TRUE); + /* sort the array of element pointers */ qsort(list, num_elems, sizeof(NODE *), sort_compare); /* shazzam! */ + /* clean up by-index as-number */ + sort_maybe_numeric_index(sort_compare, list, + num_elems, FALSE); + } list[num_elems] = array; /* actual array for use in * lint warning in Op_arrayfor_incr */ |