From 005b8909d995b699130ab97269cabab2bcf33a75 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Tue, 2 May 2023 19:29:42 -0700 Subject: sort: support stable sorting via ssort and snsort. For array-like objecgts, these objects use an array-based merge sort, using an auxiliary array equal in size to the original array. To provide the auxiliary array, a new kind of very simple vector-like object is introduced into the gc module: protected array. This looks like a raw dynamic C array of val type, returned as a val *. Under the hood, there is a heap object there, which makes the array traversable by the garbage collector. The whole point of this exercise is to make the new mergesort function safe even if the caller-supplied functions misbehave in such a way that the auxiliary array holds the only references to heap objects. * gc.c (struct prot_array): New struct, (prot_array_cls): New static variable. (gc_late_init): Register COBJ class, retaining in prot_array_cls. (prot_array_mark, prot_array_free): New static functions. (prot_array_ops): New static structure. (prot_array_alloc, prot_array_free): New functions. * gc.h (prot_array_alloc, prot_array_free): Declared. * lib.c (mergesort, ssort_vec): New static function. (snsort, ssort): New functions. * lib.h (snsort, ssort): Declared. * tests/010/sort.tl: Cover ssort. * txr.1: Documented. * stdlib/doc-syms.tl: Updated. --- eval.c | 2 + gc.c | 58 +++++++++++++++++++++++++++++ gc.h | 3 ++ lib.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ lib.h | 2 + stdlib/doc-syms.tl | 6 ++- tests/010/sort.tl | 22 +++++++++++ txr.1 | 22 +++++++---- 8 files changed, 211 insertions(+), 9 deletions(-) diff --git a/eval.c b/eval.c index 68ea1865..ca9fe25d 100644 --- a/eval.c +++ b/eval.c @@ -7509,6 +7509,8 @@ void eval_init(void) reg_fun(intern(lit("merge"), user_package), func_n4o(merge_wrap, 2)); reg_fun(intern(lit("nsort"), user_package), func_n3o(nsort, 1)); reg_fun(intern(lit("sort"), user_package), func_n3o(sort, 1)); + reg_fun(intern(lit("snsort"), user_package), func_n3o(snsort, 1)); + reg_fun(intern(lit("ssort"), user_package), func_n3o(ssort, 1)); reg_fun(intern(lit("nshuffle"), user_package), func_n2o(nshuffle, 1)); reg_fun(intern(lit("shuffle"), user_package), func_n2o(shuffle, 1)); reg_fun(intern(lit("find"), user_package), func_n4o(find, 2)); diff --git a/gc.c b/gc.c index 25b4c071..2c61c914 100644 --- a/gc.c +++ b/gc.c @@ -136,6 +136,13 @@ int full_gc; val break_obj; #endif +struct prot_array { + cnum size; + val *arr; +}; + +struct cobj_class *prot_array_cls; + val prot1(val *loc) { assert (gc_prot_top < prot_stack_limit); @@ -1132,6 +1139,8 @@ void gc_late_init(void) func_n1(gc_call_finalizers)); reg_fun(intern(lit("set-stack-limit"), user_package), func_n1(set_stack_limit)); reg_fun(intern(lit("get-stack-limit"), user_package), func_n0(get_stack_limit)); + + prot_array_cls = cobj_register(intern(lit("gc-prot-array"), system_package)); } /* @@ -1247,3 +1256,52 @@ void gc_stack_overflow(void) { uw_throwf(stack_overflow_s, lit("computation exceeded stack limit"), nao); } + +static void prot_array_mark(val obj) +{ + struct prot_array *pa = coerce(struct prot_array *, obj->co.handle); + cnum i; + + if (pa->arr) + for (i = 0; i < pa->size; i++) + gc_mark(pa->arr[i]); +} + +static void prot_array_free(val obj) +{ + struct prot_array *pa = coerce(struct prot_array *, obj->co.handle); + + if (pa->arr) { + free(pa->arr - 1); + pa->arr = 0; + } +} + +static struct cobj_ops prot_array_ops = cobj_ops_init(eq, + cobj_print_op, + prot_array_free, + prot_array_mark, + cobj_eq_hash_op); + +val *gc_prot_array_alloc(cnum size, val self) +{ + struct prot_array *pa = convert(struct prot_array *, + chk_malloc(sizeof *pa)); + + if (size >= INT_PTR_MAX) + uw_throwf(error_s, lit("~s: array too large"), self, nao); + + pa->size = size; + pa->arr = convert(val *, + chk_calloc(sizeof *pa->arr, (size + 1))) + 1; + + pa->arr[-1] = cobj(convert(mem_t *, pa), prot_array_cls, &prot_array_ops); + + return pa->arr; +} + +void gc_prot_array_free(val *arr) +{ + if (arr) + prot_array_free(arr[-1]); +} diff --git a/gc.h b/gc.h index ea09cfad..58d057db 100644 --- a/gc.h +++ b/gc.h @@ -86,3 +86,6 @@ INLINE void gc_stack_check(void) if (&v < gc_stack_limit) gc_stack_overflow(); } + +val *gc_prot_array_alloc(cnum size, val self); +void gc_prot_array_free(val *); diff --git a/lib.c b/lib.c index c721a01f..bea09ea3 100644 --- a/lib.c +++ b/lib.c @@ -10874,6 +10874,60 @@ static void sort_vec(val vec, val lessfun, val keyfun, val self) quicksort(vec, lessfun, keyfun, 0, len); } +static void mergesort(val vec, val lessfun, val keyfun, cnum from, cnum to, + val *aux) +{ + switch (to - from) { + case 0: + case 1: + break; + case 2: + if (funcall2(lessfun, + funcall1(keyfun, ref(vec, num_fast(from + 1))), + funcall1(keyfun, ref(vec, num_fast(from))))) + swap(vec, num_fast(from), num_fast(from + 1)); + break; + default: + { + cnum mid = from + (to - from) / 2; + cnum i, j, k; + + mergesort(vec, lessfun, keyfun, from, mid, aux); + mergesort(vec, lessfun, keyfun, mid, to, aux); + + for (i = from, j = mid, k = 0; i < mid && j < to; ) + { + if (funcall2(lessfun, + funcall1(keyfun, ref(vec, num_fast(i))), + funcall1(keyfun, ref(vec, num_fast(j))))) + { + aux[k++] = ref(vec, num_fast(i++)); + } else { + aux[k++] = ref(vec, num_fast(j++)); + } + } + + while (i < mid) + aux[k++] = ref(vec, num_fast(i++)); + while (j < to) + aux[k++] = ref(vec, num_fast(j++)); + + for (i = from, k = 0; i < to; i++, k++) + refset(vec, num_fast(i), aux[k]); + } + break; + } +} + +static void ssort_vec(val vec, val lessfun, val keyfun, val self) +{ + cnum len = c_fixnum(length(vec), self); + val *aux = gc_prot_array_alloc(len, self); + mergesort(vec, lessfun, keyfun, 0, len, aux); + gc_prot_array_free(aux); +} + + val nsort(val seq, val lessfun, val keyfun) { val self = lit("nsort"); @@ -10925,6 +10979,57 @@ val sort(val seq, val lessfun, val keyfun) abort(); } +val snsort(val seq, val lessfun, val keyfun) +{ + val self = lit("snsort"); + seq_info_t si = seq_info(seq); + + keyfun = default_arg(keyfun, identity_f); + lessfun = default_arg(lessfun, less_f); + + switch (si.kind) { + case SEQ_NIL: + return nil; + case SEQ_VECLIKE: + case SEQ_HASHLIKE: + ssort_vec(seq, lessfun, keyfun, self); + return seq; + case SEQ_LISTLIKE: + return sort_list(seq, lessfun, keyfun); + case SEQ_TREELIKE: + case SEQ_NOTSEQ: + unsup_obj(self, seq); + } + + abort(); +} + +val ssort(val seq, val lessfun, val keyfun) +{ + val self = lit("ssort"); + seq_info_t si = seq_info(seq); + + keyfun = default_arg(keyfun, identity_f); + lessfun = default_arg(lessfun, less_f); + + switch (si.kind) { + case SEQ_NIL: + return nil; + case SEQ_VECLIKE: + case SEQ_HASHLIKE: + seq = copy(seq); + ssort_vec(seq, lessfun, keyfun, self); + return seq; + case SEQ_LISTLIKE: + return sort_list(copy_list(seq), lessfun, keyfun); + case SEQ_TREELIKE: + case SEQ_NOTSEQ: + unsup_obj(self, seq); + } + + abort(); +} + val nshuffle(val seq, val randstate) { seq_info_t si = seq_info(seq); diff --git a/lib.h b/lib.h index f7e6f992..9e71b5f9 100644 --- a/lib.h +++ b/lib.h @@ -1319,6 +1319,8 @@ val interpose(val sep, val seq); val merge(val list1, val list2, val lessfun, val keyfun); val nsort(val seq, val lessfun, val keyfun); val sort(val seq, val lessfun, val keyfun); +val snsort(val seq, val lessfun, val keyfun); +val ssort(val seq, val lessfun, val keyfun); val nshuffle(val seq, val randstate); val shuffle(val seq, val randstate); val multi_sort(val lists, val funcs, val key_funcs); diff --git a/stdlib/doc-syms.tl b/stdlib/doc-syms.tl index 029cecbd..f3e6e9b7 100644 --- a/stdlib/doc-syms.tl +++ b/stdlib/doc-syms.tl @@ -1346,7 +1346,7 @@ ("nreverse" "N-03D8471B") ("nrot" "N-025DB962") ("nshuffle" "N-01F12561") - ("nsort" "N-01FE5176") + ("nsort" "N-03923640") ("nth" "N-0039F3FB") ("nthcdr" "N-03D71D22") ("nthlast" "N-02FC66FA") @@ -1810,6 +1810,7 @@ ("slots" "N-00E90177") ("slotset" "N-02657437") ("sme" "N-008C6621") + ("snsort" "N-03923640") ("so-acceptconn" "N-02FFF4E8") ("so-broadcast" "N-02FFF4E8") ("so-debug" "N-02FFF4E8") @@ -1851,7 +1852,7 @@ ("some" "D-0041") ("some-false" "N-016BDF48") ("some-true" "N-016BDF48") - ("sort" "N-01FE5176") + ("sort" "N-03923640") ("sort-group" "N-01E65DDC") ("source-loc" "N-0370CD69") ("source-loc-str" "N-0370CD69") @@ -1868,6 +1869,7 @@ ("sqrt" "D-0025") ("square" "D-0031") ("ssize-t" "N-01153D9E") + ("ssort" "N-03923640") ("sspl" "N-0296195B") ("sssucc" "N-038E636C") ("ssucc" "N-038E636C") diff --git a/tests/010/sort.tl b/tests/010/sort.tl index 1fd48531..1fcc0c2d 100644 --- a/tests/010/sort.tl +++ b/tests/010/sort.tl @@ -21,3 +21,25 @@ (sort slist) list (sort list (fun greater)) (reverse list) (sort slist (fun greater)) (reverse list))) + +(test (ssort ()) nil) + +(let* ((list (conses '(1 2 3 4 5 6 7 8))) + (sp (uniq [mapcar ssort (perm list (len list))]))) + (mvtest (len sp) 1 + (car sp) list)) + +(test (ssort #()) #()) + +(let* ((vec (conses #(1 2 3 4 5 6 7 8))) + (sp (uniq [mapcar ssort (perm vec (len vec))]))) + (mvtest (len sp) 1 + (car sp) vec)) + +(let* ((list (range* 0 1000)) + (slist (shuffle list))) + (mvtest + (ssort list) list + (ssort slist) list + (ssort list (fun greater)) (reverse list) + (ssort slist (fun greater)) (reverse list))) diff --git a/txr.1 b/txr.1 index 6927bd13..606e7f8d 100644 --- a/txr.1 +++ b/txr.1 @@ -37716,10 +37716,12 @@ is a transformed list of rows which is reconstituted into a list of columns. ;; (op remove-if (ap eql @3 20)) .brev -.coNP Functions @ sort and @ nsort +.coNP Functions @, sort @, nsort @ ssort and @ snsort .synb .mets (sort < sequence >> [ lessfun <> [ keyfun ]]) .mets (nsort < sequence >> [ lessfun <> [ keyfun ]]) +.mets (ssort < sequence >> [ lessfun <> [ keyfun ]]) +.mets (snsort < sequence >> [ lessfun <> [ keyfun ]]) .syne .desc The @@ -37784,19 +37786,25 @@ For strings and vectors, is not stable. The -.code sort +.code ssort and .code nsort -functions can be applied to hashes. It produces meaningful behavior -for a hash table which contains +functions have the same argument syntax and semantics as, respectively, +.code sort +and +.codn nsort . +These functions provide a stable sort for all sequences, not only +lists, at the cost of temporarily allocating memory. + +All of these functions can be applied to hashes. They produce meaningful +behavior for a hash table which contains .I N keys which are the integers from 0 to .IR "N - 1" . Such as hash is treated as if it were a vector. The values are sorted and reassigned to sorted order to the integer keys. -The behavior of -.code sort -is not specified for hashes whose contents do not conform to this convention. +The behavior is not specified for hashes whose contents do not conform to this +convention. Note: .code nsort -- cgit v1.2.3