Rewrite split program.

author: Arnold D. Robbins <arnold@skeeve.com> 2020-10-01 16:48:51 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2020-10-01 16:48:51 +0300
commit: 6a34364cde8eec7df0dd9f1de005babea18e45ec (patch)
tree: 340c1306a41a618af283b6fee0458179b099bf6b /awklib/eg
parent: 2ab1c82b4097cff8763d1ed63be6478edf55eb54 (diff)
download: egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.tar.gz
egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.tar.bz2
egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.zip
1 files changed, 125 insertions, 39 deletions
diff --git a/awklib/eg/prog/split.awk b/awklib/eg/prog/split.awk
index 9239a6c5..6e0ac16b 100644
--- a/awklib/eg/prog/split.awk
+++ b/awklib/eg/prog/split.awk
@@ -1,56 +1,142 @@
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
 #
 # Arnold Robbins, arnold@skeeve.com, Public Domain
 # May 1993
 # Revised slightly, May 2014
+# Rewritten September 2020
 
-# usage: split [-count] [file] [outname]
-
+function usage()    # print the two-line synopsis to "/dev/stderr", then exit with status 1
+{
+    print("usage: split [-l count]  [-a suffix-len] [file [outname]]") > "/dev/stderr"
+    print("       split [-b N[k|m]] [-a suffix-len] [file [outname]]") > "/dev/stderr"
+    exit 1
+}
 BEGIN {
-    outfile = "x"    # default
-    count = 1000
-    if (ARGC > 4)
-        usage()
-
-    i = 1
-    if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) {
-        count = -ARGV[i]
-        ARGV[i] = ""
-        i++
+    # Set defaults:
+    Suffix_length = 2      # letters in each output-file suffix (-a)
+    Line_count = 1000      # lines per output file (-l)
+    Byte_count = 0         # bytes per output file (-b); 0 leaves the byte rule inactive
+    Outfile = "x"          # output-file name prefix
+
+    parse_arguments()      # options may override the defaults above
+
+    init_suffix_data()     # after parsing, because it reads Suffix_length
+
+    Output = (Outfile compute_suffix())    # name of the first output file
+}
+function parse_arguments(   i, c, l, modifier)    # i, c, l, modifier are locals
+{
+    while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) {
+        if (c == "a")
+            Suffix_length = Optarg + 0    # "+ 0" forces a numeric value
+        else if (c == "b") {
+            Byte_count = Optarg + 0       # numeric value; any k/m suffix is applied below
+            Line_count = 0                # turns off the Line_count > 0 rule
+
+            l = length(Optarg)
+            modifier = substr(Optarg, l, 1)    # last character of the option argument
+            if (modifier == "k")
+                Byte_count *= 1024
+            else if (modifier == "m")
+                Byte_count *= 1024 * 1024
+        } else if (c == "l") {
+            Line_count = Optarg + 0
+            Byte_count = 0                # turns off the Byte_count > 0 rule
+        } else
+            usage()                       # anything else getopt() returned
     }
-    # test argv in case reading from stdin instead of file
-    if (i in ARGV)
-        i++    # skip datafile name
-    if (i in ARGV) {
-        outfile = ARGV[i]
+
+    # Clear out options so that awk does not treat them as input files
+    for (i = 1; i < Optind; i++)
         ARGV[i] = ""
+
+    # Check for filename
+    if (ARGV[Optind]) {
+        Optind++    # the filename itself stays in ARGV
+
+	# Check for different prefix
+        if (ARGV[Optind]) {
+            Outfile = ARGV[Optind]
+            ARGV[Optind] = ""    # the prefix is blanked out of ARGV
+
+            if (++Optind < ARGC)    # arguments remain after the prefix
+                usage()
+        }
     }
-    s1 = s2 = "a"
-    out = (outfile s1 s2)
 }
+function compute_suffix(    i, result, letters)    # returns the current suffix, then advances Suffix_ind
 {
-    if (++tcount > count) {
-        close(out)
-        if (s2 == "z") {
-            if (s1 == "z") {
-                printf("split: %s is too large to split\n",
-                       FILENAME) > "/dev/stderr"
-                exit 1
-            }
-            s1 = chr(ord(s1) + 1)
-            s2 = "a"
-        }
-        else
-            s2 = chr(ord(s2) + 1)
-        out = (outfile s1 s2)
-        tcount = 1
+    # Logical step 3: fail if the final suffix was already handed out
+    if (Reached_last) {
+        printf("split: too many files!\n") > "/dev/stderr"
+        exit 1
+    } else if (on_last_file())
+        Reached_last = 1    # fail when wrapping after 'zzz'
+
+    # Logical step 1: build the suffix string from the current indices
+    result = ""
+    letters = "abcdefghijklmnopqrstuvwxyz"
+    for (i = 1; i <= Suffix_length; i++)
+        result = result substr(letters, Suffix_ind[i], 1)
+
+    # Logical step 2: advance the indices for the next call, carrying leftward
+    for (i = Suffix_length; i >= 1; i--) {
+        if (++Suffix_ind[i] > 26) {
+            Suffix_ind[i] = 1    # wrapped past 'z': reset and carry into position i - 1
+        } else
+            break
     }
-    print > out
+
+    return result
 }
-function usage()
+function init_suffix_data(  i)    # reset the suffix state; i is a local
 {
-    print("usage: split [-num] [file] [outname]") > "/dev/stderr"
-    exit 1
+    for (i = 1; i <= Suffix_length; i++)
+        Suffix_ind[i] = 1    # index 1 selects "a" in compute_suffix()'s letters string
+
+    Reached_last = 0         # the final suffix has not been handed out yet
+}
+function on_last_file(  pos)
+{
+    # Any position not yet at index 26 ('z') means more suffixes remain
+    for (pos = 1; pos <= Suffix_length; pos++)
+        if (Suffix_ind[pos] != 26)
+            return 0
+
+    return 1
+}
+Line_count > 0 {                      # line-based splitting; parse_arguments() zeroes Line_count for -b
+    if (++tcount > Line_count) {      # this record would exceed the per-file line limit
+        close(Output)
+        Output = (Outfile compute_suffix())
+        tcount = 1                    # this record is the first line of the new file
+    }
+    print > Output
+}
+Byte_count > 0 {
+    # `+ 1' is for the final newline.  Loop rather than test once: a record
+    # longer than Byte_count must be spread over several output files.
+    while (tcount + length($0) + 1 > Byte_count) { # would overflow
+        # compute leading bytes: what still fits in the current file
+        leading_bytes = Byte_count - tcount
+        # write leading bytes
+        printf("%s", substr($0, 1, leading_bytes)) > Output
+
+        # close old file, open new file
+        close(Output)
+        Output = (Outfile compute_suffix())
+
+        # set up first bytes for new file
+        $0 = substr($0, leading_bytes + 1)  # trailing bytes
+        tcount = 0
+    }
+
+    # write full record or trailing bytes
+    tcount += length($0) + 1
+    print > Output
+}
+END {
+    close(Output)    # close the last output file
 }
author	Arnold D. Robbins <arnold@skeeve.com>	2020-10-01 16:48:51 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2020-10-01 16:48:51 +0300
commit	6a34364cde8eec7df0dd9f1de005babea18e45ec (patch)
tree	340c1306a41a618af283b6fee0458179b099bf6b /awklib/eg
parent	2ab1c82b4097cff8763d1ed63be6478edf55eb54 (diff)
download	egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.tar.gz egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.tar.bz2 egawk-6a34364cde8eec7df0dd9f1de005babea18e45ec.zip