From 1fea520248b42ca995c8797698c60301ea42ffe9 Mon Sep 17 00:00:00 2001 From: john haque Date: Sat, 20 Aug 2011 08:28:02 -0500 Subject: Speed/memory performance improvements. --- str_array.c | 759 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 759 insertions(+) create mode 100644 str_array.c (limited to 'str_array.c') diff --git a/str_array.c b/str_array.c new file mode 100644 index 00000000..330280a6 --- /dev/null +++ b/str_array.c @@ -0,0 +1,759 @@ +/* + * str_array.c - routines for associative arrays of string indices. + */ + +/* + * Copyright (C) 1986, 1988, 1989, 1991-2011 the Free Software Foundation, Inc. + * + * This file is part of GAWK, the GNU implementation of the + * AWK Programming Language. + * + * GAWK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * GAWK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#include "awk.h" + +/* + * Tree walks (``for (iggy in foo)'') and array deletions use expensive + * linear searching. So what we do is start out with small arrays and + * grow them as needed, so that our arrays are hopefully small enough, + * most of the time, that they're pretty full and we're not looking at + * wasted space. + * + * The decision is made to grow the array if the average chain length is + * ``too big''. This is defined as the total number of entries in the table + * divided by the size of the array being greater than some constant. + * + * 11/2002: We make the constant a variable, so that it can be tweaked + * via environment variable. + * 11/2002: Modern machines are bigger, cut this down from 10. + */ + +static size_t STR_CHAIN_MAX = 2; + +extern FILE *output_fp; +extern void indent(int indent_level); + +static NODE **str_array_init(NODE *symbol, NODE *subs); +static NODE **str_lookup(NODE *symbol, NODE *subs); +static NODE **str_exists(NODE *symbol, NODE *subs); +static NODE **str_clear(NODE *symbol, NODE *subs); +static NODE **str_remove(NODE *symbol, NODE *subs); +static NODE **str_list(NODE *symbol, NODE *subs); +static NODE **str_copy(NODE *symbol, NODE *newsymb); +static NODE **str_dump(NODE *symbol, NODE *ndump); + +#ifdef ARRAYDEBUG +static NODE **str_option(NODE *opt, NODE *val); +#endif + + +array_ptr str_array_func[] = { + str_array_init, + (array_ptr) 0, + str_lookup, + str_exists, + str_clear, + str_remove, + str_list, + str_copy, + str_dump, +#ifdef ARRAYDEBUG + str_option +#endif +}; + +static inline NODE **str_find(NODE *symbol, NODE *s1, size_t code1, unsigned long hash1); +static void grow_table(NODE *symbol); + +static unsigned long gst_hash_string(const char *str, size_t len, unsigned long hsize, size_t *code); +static unsigned long scramble(unsigned long x); +static unsigned long awk_hash(const char *s, size_t len, unsigned long hsize, size_t *code); + +unsigned long (*hash)(const char *s, size_t len, unsigned long hsize, size_t *code) = awk_hash; + + +/* str_array_init --- check relevant environment variables */ + +static NODE ** +str_array_init(NODE *symbol ATTRIBUTE_UNUSED, NODE *subs ATTRIBUTE_UNUSED) +{ + long newval; + const char *val; + + if ((newval = getenv_long("STR_CHAIN_MAX")) > 0) + STR_CHAIN_MAX = newval; + if ((val = getenv("AWK_HASH")) != NULL && strcmp(val, "gst") == 0) + hash = gst_hash_string; + return (NODE **) ! NULL; +} + + +/* + * assoc_lookup: + * Find SYMBOL[SUBS] in the assoc array. Install it with value "" if it + * isn't there. Returns a pointer ala get_lhs to where its value is stored. + * + * SYMBOL is the address of the node (or other pointer) being dereferenced. + * SUBS is a number or string used as the subscript. + */ + +static NODE ** +str_lookup(NODE *symbol, NODE *subs) +{ + unsigned long hash1; + NODE **lhs; + BUCKET *b; + size_t code1; + + subs = force_string(subs); + + if (symbol->buckets == NULL) + grow_table(symbol); + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size, & code1); + if ((lhs = str_find(symbol, subs, code1, hash1)) != NULL) + return lhs; + + /* It's not there, install it. */ + /* first see if we would need to grow the array, before installing */ + + symbol->table_size++; + if ((symbol->flags & ARRAYMAXED) == 0 + && (symbol->table_size / symbol->array_size) > STR_CHAIN_MAX) { + grow_table(symbol); + /* have to recompute hash value for new size */ + hash1 = code1 % (unsigned long) symbol->array_size; + } + + if (subs->stfmt != -1) { + NODE *tmp; + + /* + * Need to freeze this string value --- it must never + * change, no matter what happens to the value + * that created it or to CONVFMT, etc.; So, get + * a private copy. + */ + + tmp = make_string(subs->stptr, subs->stlen); + + /* + * Set the numeric value for the index if it's available. Useful + * for numeric sorting by index. Do this only if the numeric + * value is available, instead of all the time, since doing it + * all the time is a big performance hit for something that may + * never be used. + */ + + if (subs->flags & NUMCUR) { + tmp->numbr = subs->numbr; + tmp->flags |= NUMCUR; + } + subs = tmp; + } else { + /* string value already "frozen" */ + + subs = dupnode(subs); + } + + getbucket(b); + b->ahnext = symbol->buckets[hash1]; + symbol->buckets[hash1] = b; + b->ahname = subs; + b->ahname_str = subs->stptr; + b->ahname_len = subs->stlen; + b->ahvalue = dupnode(Nnull_string); + b->ahcode = code1; + return & (b->ahvalue); +} + +/* str_exists --- test whether the array element symbol[subs] exists or not, + * return pointer to value if it does. + */ + +static NODE ** +str_exists(NODE *symbol, NODE *subs) +{ + NODE **lhs; + unsigned long hash1; + size_t code1; + + assert(symbol->table_size > 0); + + subs = force_string(subs); + hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size, & code1); + lhs = str_find(symbol, subs, code1, hash1); + return lhs; +} + +/* str_clear --- flush all the values in symbol[] */ + +static NODE ** +str_clear(NODE *symbol, NODE *subs ATTRIBUTE_UNUSED) +{ + unsigned long i; + BUCKET *b, *next; + NODE *r; + + for (i = 0; i < symbol->array_size; i++) { + for (b = symbol->buckets[i]; b != NULL; b = next) { + next = b->ahnext; + r = b->ahvalue; + if (r->type == Node_var_array) { + assoc_clear(r); /* recursively clear all sub-arrays */ + efree(r->vname); + freenode(r); + } else + unref(r); + unref(b->ahname); + freebucket(b); + } + symbol->buckets[i] = NULL; + } + + if (symbol->buckets != NULL) + efree(symbol->buckets); + init_array(symbol); /* re-initialize symbol */ + symbol->flags &= ~ARRAYMAXED; + return NULL; +} + + +/* str_remove --- If SUBS is already in the table, remove it. */ + +static NODE ** +str_remove(NODE *symbol, NODE *subs) +{ + unsigned long hash1; + BUCKET *b, *prev; + NODE *s2; + size_t s1_len; + + assert(symbol->table_size > 0); + + s2 = force_string(subs); + hash1 = hash(s2->stptr, s2->stlen, (unsigned long) symbol->array_size, NULL); + + for (b = symbol->buckets[hash1], prev = NULL; b != NULL; + prev = b, b = b->ahnext) { + + /* Array indexes are strings; compare as such, always! */ + s1_len = b->ahname_len; + + if (s1_len != s2->stlen) + continue; + if (s1_len == 0 /* "" is a valid index */ + || memcmp(b->ahname_str, s2->stptr, s1_len) == 0) { + /* item found */ + + unref(b->ahname); + if (prev != NULL) + prev->ahnext = b->ahnext; + else + symbol->buckets[hash1] = b->ahnext; + + /* delete bucket */ + freebucket(b); + + /* one less element in array */ + if (--symbol->table_size == 0) { + if (symbol->buckets != NULL) + efree(symbol->buckets); + init_array(symbol); /* re-initialize symbol */ + symbol->flags &= ~ARRAYMAXED; + } + + return (NODE **) ! NULL; /* return success */ + } + } + + return NULL; +} + + +/* str_copy --- duplicate input array "symbol" */ + +static NODE ** +str_copy(NODE *symbol, NODE *newsymb) +{ + BUCKET **old, **new, **pnew; + BUCKET *chain, *newchain; + unsigned long cursize, i; + + assert(symbol->table_size > 0); + + /* find the current hash size */ + cursize = symbol->array_size; + + /* allocate new table */ + emalloc(new, BUCKET **, cursize * sizeof(BUCKET *), "str_copy"); + memset(new, '\0', cursize * sizeof(BUCKET *)); + + old = symbol->buckets; + + for (i = 0; i < cursize; i++) { + for (chain = old[i], pnew = & new[i]; chain != NULL; + chain = chain->ahnext + ) { + NODE *oldval, *newsubs; + + getbucket(newchain); + + /* + * copy the corresponding name and + * value from the original input list + */ + + newsubs = newchain->ahname = dupnode(chain->ahname); + newchain->ahname_str = newsubs->stptr; + newchain->ahname_len = newsubs->stlen; + + oldval = chain->ahvalue; + if (oldval->type == Node_val) + newchain->ahvalue = dupnode(oldval); + else { + NODE *r; + + r = make_array(); + r->vname = estrdup(oldval->vname, strlen(oldval->vname)); + r->parent_array = newsymb; + newchain->ahvalue = assoc_copy(oldval, r); + } + newchain->ahcode = chain->ahcode; + + *pnew = newchain; + pnew = & newchain->ahnext; + } + } + + newsymb->table_size = symbol->table_size; + newsymb->buckets = new; + newsymb->array_size = cursize; + newsymb->flags = symbol->flags; + return NULL; +} + + +/* str_list --- return a list of array items */ + +static NODE** +str_list(NODE *symbol, NODE *t) +{ + NODE **list; + NODE *subs, *val; + BUCKET *b; + unsigned long num_elems, list_size, i, k = 0; + int elem_size = 1; + + assert(symbol->table_size > 0); + + if ((t->flags & (AINDEX|AVALUE)) == (AINDEX|AVALUE)) + elem_size = 2; + + /* allocate space for array */ + num_elems = symbol->table_size; + if ((t->flags & (AINDEX|AVALUE|ADELETE)) == (AINDEX|ADELETE)) + num_elems = 1; + list_size = elem_size * num_elems; + + emalloc(list, NODE **, list_size * sizeof(NODE *), "str_list"); + + /* populate it */ + + for (i = 0; i < symbol->array_size; i++) { + for (b = symbol->buckets[i]; b != NULL; b = b->ahnext) { + /* index */ + subs = b->ahname; + if (t->flags & AINUM) + (void) force_number(subs); + list[k++] = dupnode(subs); + + /* value */ + if (t->flags & AVALUE) { + val = b->ahvalue; + if (val->type == Node_val) { + if ((t->flags & AVNUM) != 0) + (void) force_number(val); + else if ((t->flags & AVSTR) != 0) + val = force_string(val); + } + list[k++] = val; + } + if (k >= list_size) + return list; + } + } + return list; +} + + +/* str_kilobytes --- calculate memory consumption of the assoc array */ + +AWKNUM +str_kilobytes(NODE *symbol) +{ + unsigned long bucket_cnt; + AWKNUM kb; + + bucket_cnt = symbol->table_size; + + /* This does not include extra memory for indices with stfmt != -1 */ + kb = (((AWKNUM) bucket_cnt) * sizeof (BUCKET) + + ((AWKNUM) symbol->array_size) * sizeof (BUCKET *)) / 1024.0; + return kb; +} + + +/* str_dump --- dump array info */ + +static NODE ** +str_dump(NODE *symbol, NODE *ndump) +{ +#define HCNT 31 + + int indent_level; + unsigned long i, bucket_cnt; + BUCKET *b; + static size_t hash_dist[HCNT + 1]; + + indent_level = ndump->alevel; + + if ((symbol->flags & XARRAY) == 0) + fprintf(output_fp, "%s `%s'\n", + (symbol->parent_array == NULL) ? "array" : "sub-array", + array_vname(symbol)); + indent_level++; + indent(indent_level); + fprintf(output_fp, "array_func: str_array_func\n"); + if (symbol->flags != 0) { + indent(indent_level); + fprintf(output_fp, "flags: %s\n", flags2str(symbol->flags)); + } + indent(indent_level); + fprintf(output_fp, "STR_CHAIN_MAX: %d\n", STR_CHAIN_MAX); + indent(indent_level); + fprintf(output_fp, "array_size: %lu\n", (unsigned long) symbol->array_size); + indent(indent_level); + fprintf(output_fp, "table_size: %lu\n", (unsigned long) symbol->table_size); + indent(indent_level); + fprintf(output_fp, "Avg # of items per chain: %.2g\n", + ((AWKNUM) symbol->table_size) / symbol->array_size); + + indent(indent_level); + fprintf(output_fp, "memory: %.2g kB\n", str_kilobytes(symbol)); + + /* hash value distribution */ + + memset(hash_dist, '\0', (HCNT + 1) * sizeof(size_t)); + for (i = 0; i < symbol->array_size; i++) { + bucket_cnt = 0; + for (b = symbol->buckets[i]; b != NULL; b = b->ahnext) + bucket_cnt++; + if (bucket_cnt >= HCNT) + bucket_cnt = HCNT; + hash_dist[bucket_cnt]++; + } + + indent(indent_level); + fprintf(output_fp, "Hash distribution:\n"); + indent_level++; + for (i = 0; i <= HCNT; i++) { + if (hash_dist[i] > 0) { + indent(indent_level); + if (i == HCNT) + fprintf(output_fp, "[>=%lu]:%lu\n", + (unsigned long) HCNT, (unsigned long) hash_dist[i]); + else + fprintf(output_fp, "[%lu]:%lu\n", + (unsigned long) i, (unsigned long) hash_dist[i]); + } + } + indent_level--; + + /* dump elements */ + + if (ndump->adepth >= 0) { + const char *aname; + + fprintf(output_fp, "\n"); + aname = make_aname(symbol); + for (i = 0; i < symbol->array_size; i++) { + for (b = symbol->buckets[i]; b != NULL; b = b->ahnext) + assoc_info(b->ahname, b->ahvalue, ndump, aname); + } + } + + return NULL; + +#undef HCNT +} + + +/* awk_hash --- calculate the hash function of the string in subs */ + +static unsigned long +awk_hash(const char *s, size_t len, unsigned long hsize, size_t *code) +{ + unsigned long h = 0; + unsigned long htmp; + + /* + * Ozan Yigit's original sdbm hash, copied from Margo Seltzers + * db package. + * + * This is INCREDIBLY ugly, but fast. We break the string up into + * 8 byte units. On the first time through the loop we get the + * "leftover bytes" (strlen % 8). On every other iteration, we + * perform 8 HASHC's so we handle all 8 bytes. Essentially, this + * saves us 7 cmp & branch instructions. If this routine is + * heavily used enough, it's worth the ugly coding. + */ + + /* + * Even more speed: + * #define HASHC h = *s++ + 65599 * h + * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts + * + * 4/2011: Force the results to 32 bits, to get the same + * result on both 32- and 64-bit systems. This may be a + * bad idea. + */ +#define HASHC htmp = (h << 6); \ + h = *s++ + htmp + (htmp << 10) - h ; \ + htmp &= 0xFFFFFFFF; \ + h &= 0xFFFFFFFF + + h = 0; + + /* "Duff's Device" */ + if (len > 0) { + size_t loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { /* All fall throughs */ + HASHC; + case 7: HASHC; + case 6: HASHC; + case 5: HASHC; + case 4: HASHC; + case 3: HASHC; + case 2: HASHC; + case 1: HASHC; + } while (--loop); + } + } + + if (code != NULL) + *code = h; + + if (h >= hsize) + h %= hsize; + return h; +} + + +/* str_find --- locate symbol[subs] */ + +static inline NODE ** +str_find(NODE *symbol, NODE *s1, size_t code1, unsigned long hash1) +{ + BUCKET *b; + size_t s2_len; + + for (b = symbol->buckets[hash1]; b != NULL; b = b->ahnext) { + /* + * This used to use cmp_nodes() here. That's wrong. + * Array indexes are strings; compare as such, always! + */ + s2_len = b->ahname_len; + + if (code1 == b->ahcode + && s1->stlen == s2_len + && (s2_len == 0 /* "" is a valid index */ + || memcmp(s1->stptr, b->ahname_str, s2_len) == 0) + ) + return & (b->ahvalue); + } + return NULL; +} + + +/* grow_table --- grow a hash table */ + +static void +grow_table(NODE *symbol) +{ + BUCKET **old, **new; + BUCKET *chain, *next; + int i, j; + unsigned long oldsize, newsize, k; + unsigned long hash1; + + /* + * This is an array of primes. We grow the table by an order of + * magnitude each time (not just doubling) so that growing is a + * rare operation. We expect, on average, that it won't happen + * more than twice. The final size is also chosen to be small + * enough so that MS-DOG mallocs can handle it. When things are + * very large (> 8K), we just double more or less, instead of + * just jumping from 8K to 64K. + */ + + static const unsigned long sizes[] = { + 13, 127, 1021, 8191, 16381, 32749, 65497, + 131101, 262147, 524309, 1048583, 2097169, + 4194319, 8388617, 16777259, 33554467, + 67108879, 134217757, 268435459, 536870923, + 1073741827 + }; + + /* find next biggest hash size */ + newsize = oldsize = symbol->array_size; + + for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) { + if (oldsize < sizes[i]) { + newsize = sizes[i]; + break; + } + } + if (newsize == oldsize) { /* table already at max (!) */ + symbol->flags |= ARRAYMAXED; + return; + } + + /* allocate new table */ + emalloc(new, BUCKET **, newsize * sizeof(BUCKET *), "grow_table"); + memset(new, '\0', newsize * sizeof(BUCKET *)); + + old = symbol->buckets; + symbol->buckets = new; + symbol->array_size = newsize; + + /* brand new hash table, set things up and return */ + if (old == NULL) { + symbol->table_size = 0; + return; + } + + /* old hash table there, move stuff to new, free old */ + + /* + * note that symbol->table_size does not change if an old array, + * and is explicitly set to 0 if a new one. + */ + + for (k = 0; k < oldsize; k++) { + for (chain = old[k]; chain != NULL; chain = next) { + next = chain->ahnext; + hash1 = chain->ahcode % newsize; + + /* remove from old list, add to new */ + chain->ahnext = new[hash1]; + new[hash1] = chain; + } + } + efree(old); +} + + +#ifdef ARRAYDEBUG + +static NODE ** +str_option(NODE *opt, NODE *val) +{ + int newval; + NODE *tmp; + NODE **ret = (NODE **) ! NULL; + + tmp = force_string(opt); + (void) force_number(val); + if (STREQ(tmp->stptr, "STR_CHAIN_MAX")) { + newval = (int) val->numbr; + if (newval > 0) + STR_CHAIN_MAX = newval; + } else + ret = NULL; + return ret; +} +#endif + + +/* +From bonzini@gnu.org Mon Oct 28 16:05:26 2002 +Date: Mon, 28 Oct 2002 13:33:03 +0100 +From: Paolo Bonzini +To: arnold@skeeve.com +Subject: Hash function +Message-ID: <20021028123303.GA6832@biancaneve> + +Here is the hash function I'm using in GNU Smalltalk. The scrambling is +needed if you use powers of two as the table sizes. If you use primes it +is not needed. + +To use double-hashing with power-of-two size, you should use the +_gst_hash_string(str, len) as the primary hash and +scramble(_gst_hash_string (str, len)) | 1 as the secondary hash. + +Paolo + +*/ +/* + * ADR: Slightly modified to work w/in the context of gawk. + */ + +static unsigned long +gst_hash_string(const char *str, size_t len, unsigned long hsize, size_t *code) +{ + unsigned long hashVal = 1497032417; /* arbitrary value */ + unsigned long ret; + + while (len--) { + hashVal += *str++; + hashVal += (hashVal << 10); + hashVal ^= (hashVal >> 6); + } + + ret = scramble(hashVal); + + if (code != NULL) + *code = ret; + + if (ret >= hsize) + ret %= hsize; + + return ret; +} + +static unsigned long +scramble(unsigned long x) +{ + if (sizeof(long) == 4) { + int y = ~x; + + x += (y << 10) | (y >> 22); + x += (x << 6) | (x >> 26); + x -= (x << 16) | (x >> 16); + } else { + x ^= (~x) >> 31; + x += (x << 21) | (x >> 11); + x += (x << 5) | (x >> 27); + x += (x << 27) | (x >> 5); + x += (x << 31); + } + + return x; +} -- cgit v1.2.3