diff options
-rw-r--r-- | awklib/eg/prog/split.awk | 164 | ||||
-rw-r--r-- | doc/ChangeLog | 6 | ||||
-rw-r--r-- | doc/gawk.info | 848 | ||||
-rw-r--r-- | doc/gawk.texi | 372 | ||||
-rw-r--r-- | doc/gawktexi.in | 372 | ||||
-rw-r--r-- | doc/wordlist | 13 |
6 files changed, 1225 insertions, 550 deletions
diff --git a/awklib/eg/prog/split.awk b/awklib/eg/prog/split.awk index 9239a6c5..6e0ac16b 100644 --- a/awklib/eg/prog/split.awk +++ b/awklib/eg/prog/split.awk @@ -1,56 +1,142 @@ # split.awk --- do split in awk # -# Requires ord() and chr() library functions +# Requires getopt() library function. # # Arnold Robbins, arnold@skeeve.com, Public Domain # May 1993 # Revised slightly, May 2014 +# Rewritten September 2020 -# usage: split [-count] [file] [outname] - +function usage() +{ + print("usage: split [-l count] [-a suffix-len] [file [outname]]") > "/dev/stderr" + print(" split [-b N[k|m]] [-a suffix-len] [file [outname]]") > "/dev/stderr" + exit 1 +} BEGIN { - outfile = "x" # default - count = 1000 - if (ARGC > 4) - usage() - - i = 1 - if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) { - count = -ARGV[i] - ARGV[i] = "" - i++ + # Set defaults: + Suffix_length = 2 + Line_count = 1000 + Byte_count = 0 + Outfile = "x" + + parse_arguments() + + init_suffix_data() + + Output = (Outfile compute_suffix()) +} +function parse_arguments( i, c, l, modifier) +{ + while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) { + if (c == "a") + Suffix_length = Optarg + 0 + else if (c == "b") { + Byte_count = Optarg + 0 + Line_count = 0 + + l = length(Optarg) + modifier = substr(Optarg, l, 1) + if (modifier == "k") + Byte_count *= 1024 + else if (modifier == "m") + Byte_count *= 1024 * 1024 + } else if (c == "l") { + Line_count = Optarg + 0 + Byte_count = 0 + } else + usage() } - # test argv in case reading from stdin instead of file - if (i in ARGV) - i++ # skip datafile name - if (i in ARGV) { - outfile = ARGV[i] + + # Clear out options + for (i = 1; i < Optind; i++) ARGV[i] = "" + + # Check for filename + if (ARGV[Optind]) { + Optind++ + + # Check for different prefix + if (ARGV[Optind]) { + Outfile = ARGV[Optind] + ARGV[Optind] = "" + + if (++Optind < ARGC) + usage() + } } - s1 = s2 = "a" - out = (outfile s1 s2) } +function compute_suffix( i, result, letters) { - if (++tcount > count) { - 
close(out) - if (s2 == "z") { - if (s1 == "z") { - printf("split: %s is too large to split\n", - FILENAME) > "/dev/stderr" - exit 1 - } - s1 = chr(ord(s1) + 1) - s2 = "a" - } - else - s2 = chr(ord(s2) + 1) - out = (outfile s1 s2) - tcount = 1 + # Logical step 3 + if (Reached_last) { + printf("split: too many files!\n") > "/dev/stderr" + exit 1 + } else if (on_last_file()) + Reached_last = 1 # fail when wrapping after 'zzz' + + # Logical step 1 + result = "" + letters = "abcdefghijklmnopqrstuvwxyz" + for (i = 1; i <= Suffix_length; i++) + result = result substr(letters, Suffix_ind[i], 1) + + # Logical step 2 + for (i = Suffix_length; i >= 1; i--) { + if (++Suffix_ind[i] > 26) { + Suffix_ind[i] = 1 + } else + break } - print > out + + return result } -function usage() +function init_suffix_data( i) { - print("usage: split [-num] [file] [outname]") > "/dev/stderr" - exit 1 + for (i = 1; i <= Suffix_length; i++) + Suffix_ind[i] = 1 + + Reached_last = 0 +} +function on_last_file( i, on_last) +{ + on_last = 1 + for (i = 1; i <= Suffix_length; i++) { + on_last = on_last && (Suffix_ind[i] == 26) + } + + return on_last +} +Line_count > 0 { + if (++tcount > Line_count) { + close(Output) + Output = (Outfile compute_suffix()) + tcount = 1 + } + print > Output +} +Byte_count > 0 { + # `+ 1' is for the final newline + if (tcount + length($0) + 1 > Byte_count) { # would overflow + # compute leading bytes + leading_bytes = Byte_count - tcount + + # write leading bytes + printf("%s", substr($0, 1, leading_bytes)) > Output + + # close old file, open new file + close(Output) + Output = (Outfile compute_suffix()) + + # set up first bytes for new file + $0 = substr($0, leading_bytes + 1) # trailing bytes + tcount = 0 + } + + # write full record or trailing bytes + tcount += length($0) + 1 + print > Output +} +END { + close(Output) } diff --git a/doc/ChangeLog b/doc/ChangeLog index bb6aa39e..e6486312 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,9 @@ +2020-10-01 Arnold D. 
Robbins <arnold@skeeve.com> + + * gawktexi.in (Split Program): Rewrite split to be POSIX + compliant. Update all the prose. + * wordlist: Update. + 2020-09-24 Arnold D. Robbins <arnold@skeeve.com> * gawktexi.in: Fix a spelling error. diff --git a/doc/gawk.info b/doc/gawk.info index a3c9a803..5523f244 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -18418,103 +18418,281 @@ File: gawk.info, Node: Split Program, Next: Tee Program, Prev: Id Program, U 11.2.4 Splitting a Large File into Pieces ----------------------------------------- -The 'split' program splits large text files into smaller pieces. Usage -is as follows:(1) +The 'split' utility splits large text files into smaller pieces. The +usage follows the POSIX standard for 'split' and is as follows: - 'split' ['-COUNT'] [FILE] [PREFIX] + 'split' ['-l' COUNT] ['-a' SUFFIX-LEN] [FILE [OUTNAME]] + 'split' '-b' N['k'|'m']] ['-a' SUFFIX-LEN] [FILE [OUTNAME]] By default, the output files are named 'xaa', 'xab', and so on. Each file has 1,000 lines in it, with the likely exception of the last file. -To change the number of lines in each file, supply a number on the -command line preceded with a minus sign (e.g., '-500' for files with 500 -lines in them instead of 1,000). To change the names of the output -files to something like 'myfileaa', 'myfileab', and so on, supply an -additional argument that specifies the file name prefix. - - Here is a version of 'split' in 'awk'. It uses the 'ord()' and -'chr()' functions presented in *note Ordinal Functions::. - - The program first sets its defaults, and then tests to make sure -there are not too many arguments. It then looks at each argument in -turn. The first argument could be a minus sign followed by a number. -If it is, this happens to look like a negative number, so it is made -positive, and that is the count of lines. 
The data file name is skipped -over and the final argument is used as the prefix for the output file -names: + + The 'split' program has evolved over time, and the current POSIX +version is more complicated than the original Unix version. The options +and what they do are as follows: + +'-a' SUFFIX-LEN + Use SUFFIX-LEN characters for the suffix. For example, if + SUFFIX-LEN is four, the output files would range from 'xaaaa' to + 'xzzzz'. + +'-b' N['k'|'m']] + Instead of each file containing a specified number of lines, each + file should have (at most) N bytes. Supplying a trailing 'k' + multiplies N by 1,024, yielding kilobytes. Supplying a trailing + 'm' mutiplies N by 1,048,576 (1,024 * 1,024) yielding megabytes. + (This option is mutually exclusive with '-l'). + +'-l' COUNT + Each file should have at most COUNT lines, instead of the default + 1,000. (This option is mutually exclusive with '-b'). + + If supplied, FILE is the input file to read. Otherwise standard +input is processed. If supplied, OUTNAME is the leading prefix to use +for file names, instead of 'x'. + + In order to use the '-b' option, 'gawk' should be invoked with its +'-b' option (*note Options::), or with the environment variable 'LC_ALL' +set to 'C', so that each input byte is treated as a separate +character.(1) + + Here is an implementation of 'split' in 'awk'. It uses the +'getopt()' function presented in *note Getopt Function::. + + The program begins with a standard descriptive comment and then a +'usage()' function describing the options: # split.awk --- do split in awk # - # Requires ord() and chr() library functions - # usage: split [-count] [file] [outname] + # Requires getopt() library function. + function usage() + { + print("usage: split [-l count] [-a suffix-len] [file [outname]]") > "/dev/stderr" + print(" split [-b N[k|m]] [-a suffix-len] [file [outname]]") > "/dev/stderr" + exit 1 + } + + Next, in a 'BEGIN' rule we set the default values and parse the +arguments. 
After that we initialize the data structures used to cycle +the suffix from 'aa...' to 'zz...'. Finally we set the name of the +first output file: BEGIN { - outfile = "x" # default - count = 1000 - if (ARGC > 4) - usage() + # Set defaults: + Suffix_length = 2 + Line_count = 1000 + Byte_count = 0 + Outfile = "x" - i = 1 - if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) { - count = -ARGV[i] - ARGV[i] = "" - i++ + parse_arguments() + + init_suffix_data() + + Output = (Outfile compute_suffix()) + } + + Parsing the arguments is straightforward. The program follows our +convention (*note Library Names::) of having important global variables +start with an uppercase letter: + + function parse_arguments( i, c, l, modifier) + { + while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) { + if (c == "a") + Suffix_length = Optarg + 0 + else if (c == "b") { + Byte_count = Optarg + 0 + Line_count = 0 + + l = length(Optarg) + modifier = substr(Optarg, l, 1) + if (modifier == "k") + Byte_count *= 1024 + else if (modifier == "m") + Byte_count *= 1024 * 1024 + } else if (c == "l") { + Line_count = Optarg + 0 + Byte_count = 0 + } else + usage() } - # test argv in case reading from stdin instead of file - if (i in ARGV) - i++ # skip datafile name - if (i in ARGV) { - outfile = ARGV[i] + + # Clear out options + for (i = 1; i < Optind; i++) ARGV[i] = "" + + # Check for filename + if (ARGV[Optind]) { + Optind++ + + # Check for different prefix + if (ARGV[Optind]) { + Outfile = ARGV[Optind] + ARGV[Optind] = "" + + if (++Optind < ARGC) + usage() + } + } + } + + Managing the file name suffix is interesting. Given a suffix of +length three, say, the values go from 'aaa', 'aab', 'aac' and so on, all +the way to 'zzx', 'zzy', and finally 'zzz'. There are two important +aspects to this: + + * We have to be able to easily generate these suffixes, and in + particular easily handle "rolling over"; for example, going from + 'abz' to 'aca'. 
+ + * We have to tell when we've finished with the last file, so that if + we still have more input data we can print an error message and + exit. The trick is to handle this _after_ using the last suffix, + and not when the final suffix is created. + + The computation is handled by 'compute_suffix()'. This function is +called every time a new file is opened. + + The flow here is messy, because we want to generate 'zzzz' (say), and +use it, and only produce an error after all the file name suffixes have +been used up. The logical steps are as follows: + + 1. Generate the suffix, saving the value in 'result' to return. To do + this, the supplementary array 'Suffix_ind' contains one element for + each letter in the suffix. Each element ranges from 1 to 26, + acting as the index into a string containing all the lowercase + letters of the English alphabet. It is initialized by + 'init_suffix_data()'. 'result' is built up one letter at a time, + using each 'substr()'. + + 2. Prepare the data structures for the next time 'compute_suffix()' is + called. To do this, we loop over 'Suffix_ind', _backwards_. If + the current element is less than 26, it's incremented and the loop + breaks ('abq' goes to 'abr'). Otherwise, the element is reset to + one and we move down the list ('abz' to 'aca'). Thus, the + 'Suffix_ind' array is always "one step ahead" of the actual file + name suffix to be returned. + + 3. Check if we've gone past the limit of possible filenames. If + 'Reached_last' is true, print a message and exit. Otherwise, check + if 'Suffix_ind' describes a suffix where all the letters are 'z'. + If that's the case we're about to return the final suffix. If so, + we set 'Reached_last' to true so that the _next_ call to + 'compute_suffix()' will cause a failure. 
+ + Physically, the steps in the function occur in the order 3, 1, 2: + + function compute_suffix( i, result, letters) + { + # Logical step 3 + if (Reached_last) { + printf("split: too many files!\n") > "/dev/stderr" + exit 1 + } else if (on_last_file()) + Reached_last = 1 # fail when wrapping after 'zzz' + + # Logical step 1 + result = "" + letters = "abcdefghijklmnopqrstuvwxyz" + for (i = 1; i <= Suffix_length; i++) + result = result substr(letters, Suffix_ind[i], 1) + + # Logical step 2 + for (i = Suffix_length; i >= 1; i--) { + if (++Suffix_ind[i] > 26) { + Suffix_ind[i] = 1 + } else + break } - s1 = s2 = "a" - out = (outfile s1 s2) + + return result } - The next rule does most of the work. 'tcount' (temporary count) -tracks how many lines have been printed to the output file so far. If -it is greater than 'count', it is time to close the current file and -start a new one. 's1' and 's2' track the current suffixes for the file -name. If they are both 'z', the file is just too big. Otherwise, 's1' -moves to the next letter in the alphabet and 's2' starts over again at -'a': + The 'Suffix_ind' array and 'Reached_last' are initialized by +'init_suffix_data()': + function init_suffix_data( i) { - if (++tcount > count) { - close(out) - if (s2 == "z") { - if (s1 == "z") { - printf("split: %s is too large to split\n", - FILENAME) > "/dev/stderr" - exit 1 - } - s1 = chr(ord(s1) + 1) - s2 = "a" - } - else - s2 = chr(ord(s2) + 1) - out = (outfile s1 s2) + for (i = 1; i <= Suffix_length; i++) + Suffix_ind[i] = 1 + + Reached_last = 0 + } + + The function 'on_last_file()' returns true if 'Suffix_ind' describes +a suffix where all the letters are 'z' by checking that all the elements +in the array are equal to 26: + + function on_last_file( i, on_last) + { + on_last = 1 + for (i = 1; i <= Suffix_length; i++) { + on_last = on_last && (Suffix_ind[i] == 26) + } + + return on_last + } + + The actual work of splitting the input file is done by the next two +rules. 
Since splitting by line count and splitting by byte count are +mutually exclusive, we simply use two separate rules, one for when +'Line_count' is greater than zero, and another for when 'Byte_count' is +greater than zero. + + The variable 'tcount' counts how many lines have been processed so +far. When it exceeds 'Line_count', it's time to close the previous file +and switch to a new one: + + Line_count > 0 { + if (++tcount > Line_count) { + close(Output) + Output = (Outfile compute_suffix()) tcount = 1 } - print > out + print > Output } -The 'usage()' function simply prints an error message and exits: + The rule for handling bytes is more complicated. Since lines most +likely vary in length, the 'Byte_count' boundary may be hit in the +middle of an input record. In that case, 'split' has to write enough of +the first bytes of the input record to finish up 'Byte_count' bytes, +close the file, open a new file, and write the rest of the record to the +new file. The logic here does all that: - function usage() - { - print("usage: split [-num] [file] [outname]") > "/dev/stderr" - exit 1 + Byte_count > 0 { + # `+ 1' is for the final newline + if (tcount + length($0) + 1 > Byte_count) { # would overflow + # compute leading bytes + leading_bytes = Byte_count - tcount + + # write leading bytes + printf("%s", substr($0, 1, leading_bytes)) > Output + + # close old file, open new file + close(Output) + Output = (Outfile compute_suffix()) + + # set up first bytes for new file + $0 = substr($0, leading_bytes + 1) # trailing bytes + tcount = 0 + } + + # write full record or trailing bytes + tcount += length($0) + 1 + print > Output } - This program is a bit sloppy; it relies on 'awk' to automatically -close the last file instead of doing it in an 'END' rule. It also -assumes that letters are contiguous in the character set, which isn't -true for EBCDIC systems. 
+ Finally, the 'END' rule cleans up by closing the last output file: + + END { + close(Output) + } ---------- Footnotes ---------- - (1) This is the traditional usage. The POSIX usage is different, but -not relevant for what the program aims to demonstrate. + (1) Using '-b' twice requires separating 'gawk''s options from those +of the program. For example: 'gawk -f getopt.awk -f split.awk -b -- -b +42m large-file.txt split-'. File: gawk.info, Node: Tee Program, Next: Uniq Program, Prev: Split Program, Up: Clones @@ -37254,7 +37432,7 @@ Index * split string into array: String Functions. (line 303) * split utility: Split Program. (line 6) * split() function, array elements, deleting: Delete. (line 61) -* split.awk program: Split Program. (line 30) +* split.awk program: Split Program. (line 50) * sprintf: OFMT. (line 15) * sprintf <1>: String Functions. (line 395) * sprintf() function, print/printf statements and: Round Function. @@ -37958,270 +38136,270 @@ Node: Cut Program728246 Node: Egrep Program738175 Node: Id Program747186 Node: Split Program757133 -Ref: Split Program-Footnote-1760591 -Node: Tee Program760720 -Node: Uniq Program763510 -Node: Wc Program771074 -Ref: Wc Program-Footnote-1775329 -Node: Miscellaneous Programs775423 -Node: Dupword Program776636 -Node: Alarm Program778666 -Node: Translate Program783521 -Ref: Translate Program-Footnote-1788086 -Node: Labels Program788356 -Ref: Labels Program-Footnote-1791707 -Node: Word Sorting791791 -Node: History Sorting795863 -Node: Extract Program798088 -Node: Simple Sed806142 -Node: Igawk Program809216 -Ref: Igawk Program-Footnote-1823547 -Ref: Igawk Program-Footnote-2823749 -Ref: Igawk Program-Footnote-3823871 -Node: Anagram Program823986 -Node: Signature Program827048 -Node: Programs Summary828295 -Node: Programs Exercises829509 -Ref: Programs Exercises-Footnote-1833639 -Node: Advanced Features833725 -Node: Nondecimal Data835715 -Node: Array Sorting837306 -Node: Controlling Array Traversal838006 -Ref: Controlling 
Array Traversal-Footnote-1846374 -Node: Array Sorting Functions846492 -Ref: Array Sorting Functions-Footnote-1851583 -Node: Two-way I/O851779 -Ref: Two-way I/O-Footnote-1859500 -Ref: Two-way I/O-Footnote-2859687 -Node: TCP/IP Networking859769 -Node: Profiling862887 -Node: Advanced Features Summary872201 -Node: Internationalization874045 -Node: I18N and L10N875525 -Node: Explaining gettext876212 -Ref: Explaining gettext-Footnote-1882104 -Ref: Explaining gettext-Footnote-2882289 -Node: Programmer i18n882454 -Ref: Programmer i18n-Footnote-1887403 -Node: Translator i18n887452 -Node: String Extraction888246 -Ref: String Extraction-Footnote-1889378 -Node: Printf Ordering889464 -Ref: Printf Ordering-Footnote-1892250 -Node: I18N Portability892314 -Ref: I18N Portability-Footnote-1894770 -Node: I18N Example894833 -Ref: I18N Example-Footnote-1898108 -Ref: I18N Example-Footnote-2898181 -Node: Gawk I18N898290 -Node: I18N Summary898939 -Node: Debugger900280 -Node: Debugging901280 -Node: Debugging Concepts901721 -Node: Debugging Terms903530 -Node: Awk Debugging906105 -Ref: Awk Debugging-Footnote-1907050 -Node: Sample Debugging Session907182 -Node: Debugger Invocation907716 -Node: Finding The Bug909102 -Node: List of Debugger Commands915576 -Node: Breakpoint Control916909 -Node: Debugger Execution Control920603 -Node: Viewing And Changing Data923965 -Node: Execution Stack927506 -Node: Debugger Info929143 -Node: Miscellaneous Debugger Commands933214 -Node: Readline Support938276 -Node: Limitations939172 -Node: Debugging Summary941726 -Node: Namespaces943005 -Node: Global Namespace944116 -Node: Qualified Names945514 -Node: Default Namespace946513 -Node: Changing The Namespace947254 -Node: Naming Rules948868 -Node: Internal Name Management950716 -Node: Namespace Example951758 -Node: Namespace And Features954320 -Node: Namespace Summary955755 -Node: Arbitrary Precision Arithmetic957232 -Node: Computer Arithmetic958719 -Ref: table-numeric-ranges962485 -Ref: 
table-floating-point-ranges962978 -Ref: Computer Arithmetic-Footnote-1963636 -Node: Math Definitions963693 -Ref: table-ieee-formats967009 -Ref: Math Definitions-Footnote-1967612 -Node: MPFR features967717 -Node: FP Math Caution969435 -Ref: FP Math Caution-Footnote-1970507 -Node: Inexactness of computations970876 -Node: Inexact representation971836 -Node: Comparing FP Values973196 -Node: Errors accumulate974437 -Node: Getting Accuracy975870 -Node: Try To Round978580 -Node: Setting precision979479 -Ref: table-predefined-precision-strings980176 -Node: Setting the rounding mode982006 -Ref: table-gawk-rounding-modes982380 -Ref: Setting the rounding mode-Footnote-1986311 -Node: Arbitrary Precision Integers986490 -Ref: Arbitrary Precision Integers-Footnote-1989665 -Node: Checking for MPFR989814 -Node: POSIX Floating Point Problems991288 -Ref: POSIX Floating Point Problems-Footnote-1995573 -Node: Floating point summary995611 -Node: Dynamic Extensions997801 -Node: Extension Intro999354 -Node: Plugin License1000620 -Node: Extension Mechanism Outline1001417 -Ref: figure-load-extension1001856 -Ref: figure-register-new-function1003421 -Ref: figure-call-new-function1004513 -Node: Extension API Description1006575 -Node: Extension API Functions Introduction1008288 -Ref: table-api-std-headers1010124 -Node: General Data Types1014373 -Ref: General Data Types-Footnote-11023003 -Node: Memory Allocation Functions1023302 -Ref: Memory Allocation Functions-Footnote-11027803 -Node: Constructor Functions1027902 -Node: API Ownership of MPFR and GMP Values1031368 -Node: Registration Functions1032681 -Node: Extension Functions1033381 -Node: Exit Callback Functions1038703 -Node: Extension Version String1039953 -Node: Input Parsers1040616 -Node: Output Wrappers1053337 -Node: Two-way processors1057849 -Node: Printing Messages1060114 -Ref: Printing Messages-Footnote-11061285 -Node: Updating ERRNO1061438 -Node: Requesting Values1062177 -Ref: table-value-types-returned1062914 -Node: Accessing 
Parameters1063850 -Node: Symbol Table Access1065087 -Node: Symbol table by name1065599 -Ref: Symbol table by name-Footnote-11068623 -Node: Symbol table by cookie1068751 -Ref: Symbol table by cookie-Footnote-11072936 -Node: Cached values1073000 -Ref: Cached values-Footnote-11076536 -Node: Array Manipulation1076689 -Ref: Array Manipulation-Footnote-11077780 -Node: Array Data Types1077817 -Ref: Array Data Types-Footnote-11080475 -Node: Array Functions1080567 -Node: Flattening Arrays1085065 -Node: Creating Arrays1092041 -Node: Redirection API1096808 -Node: Extension API Variables1099641 -Node: Extension Versioning1100352 -Ref: gawk-api-version1100781 -Node: Extension GMP/MPFR Versioning1102512 -Node: Extension API Informational Variables1104140 -Node: Extension API Boilerplate1105213 -Node: Changes from API V11109187 -Node: Finding Extensions1110759 -Node: Extension Example1111318 -Node: Internal File Description1112116 -Node: Internal File Ops1116196 -Ref: Internal File Ops-Footnote-11127546 -Node: Using Internal File Ops1127686 -Ref: Using Internal File Ops-Footnote-11130069 -Node: Extension Samples1130343 -Node: Extension Sample File Functions1131872 -Node: Extension Sample Fnmatch1139521 -Node: Extension Sample Fork1141008 -Node: Extension Sample Inplace1142226 -Node: Extension Sample Ord1145851 -Node: Extension Sample Readdir1146687 -Ref: table-readdir-file-types1147576 -Node: Extension Sample Revout1148643 -Node: Extension Sample Rev2way1149232 -Node: Extension Sample Read write array1149972 -Node: Extension Sample Readfile1151914 -Node: Extension Sample Time1153009 -Node: Extension Sample API Tests1154761 -Node: gawkextlib1155253 -Node: Extension summary1158171 -Node: Extension Exercises1161873 -Node: Language History1163115 -Node: V7/SVR3.11164771 -Node: SVR41166923 -Node: POSIX1168357 -Node: BTL1169738 -Node: POSIX/GNU1170467 -Node: Feature History1176245 -Node: Common Extensions1192564 -Node: Ranges and Locales1193847 -Ref: Ranges and 
Locales-Footnote-11198463 -Ref: Ranges and Locales-Footnote-21198490 -Ref: Ranges and Locales-Footnote-31198725 -Node: Contributors1198948 -Node: History summary1204945 -Node: Installation1206325 -Node: Gawk Distribution1207269 -Node: Getting1207753 -Node: Extracting1208716 -Node: Distribution contents1210354 -Node: Unix Installation1216834 -Node: Quick Installation1217516 -Node: Shell Startup Files1219930 -Node: Additional Configuration Options1221019 -Node: Configuration Philosophy1223334 -Node: Non-Unix Installation1225703 -Node: PC Installation1226163 -Node: PC Binary Installation1227001 -Node: PC Compiling1227436 -Node: PC Using1228553 -Node: Cygwin1232106 -Node: MSYS1233330 -Node: VMS Installation1233932 -Node: VMS Compilation1234723 -Ref: VMS Compilation-Footnote-11235952 -Node: VMS Dynamic Extensions1236010 -Node: VMS Installation Details1237695 -Node: VMS Running1239948 -Node: VMS GNV1244227 -Node: VMS Old Gawk1244962 -Node: Bugs1245433 -Node: Bug address1246096 -Node: Usenet1249078 -Node: Maintainers1250082 -Node: Other Versions1251267 -Node: Installation summary1258355 -Node: Notes1259564 -Node: Compatibility Mode1260358 -Node: Additions1261140 -Node: Accessing The Source1262065 -Node: Adding Code1263502 -Node: New Ports1269721 -Node: Derived Files1274096 -Ref: Derived Files-Footnote-11279756 -Ref: Derived Files-Footnote-21279791 -Ref: Derived Files-Footnote-31280389 -Node: Future Extensions1280503 -Node: Implementation Limitations1281161 -Node: Extension Design1282371 -Node: Old Extension Problems1283515 -Ref: Old Extension Problems-Footnote-11285033 -Node: Extension New Mechanism Goals1285090 -Ref: Extension New Mechanism Goals-Footnote-11288454 -Node: Extension Other Design Decisions1288643 -Node: Extension Future Growth1290756 -Node: Notes summary1291362 -Node: Basic Concepts1292520 -Node: Basic High Level1293201 -Ref: figure-general-flow1293483 -Ref: figure-process-flow1294168 -Ref: Basic High Level-Footnote-11297469 -Node: Basic Data Typing1297654 
-Node: Glossary1300982 -Node: Copying1332867 -Node: GNU Free Documentation License1370410 -Node: Index1395530 +Ref: Split Program-Footnote-1766905 +Node: Tee Program767078 +Node: Uniq Program769868 +Node: Wc Program777432 +Ref: Wc Program-Footnote-1781687 +Node: Miscellaneous Programs781781 +Node: Dupword Program782994 +Node: Alarm Program785024 +Node: Translate Program789879 +Ref: Translate Program-Footnote-1794444 +Node: Labels Program794714 +Ref: Labels Program-Footnote-1798065 +Node: Word Sorting798149 +Node: History Sorting802221 +Node: Extract Program804446 +Node: Simple Sed812500 +Node: Igawk Program815574 +Ref: Igawk Program-Footnote-1829905 +Ref: Igawk Program-Footnote-2830107 +Ref: Igawk Program-Footnote-3830229 +Node: Anagram Program830344 +Node: Signature Program833406 +Node: Programs Summary834653 +Node: Programs Exercises835867 +Ref: Programs Exercises-Footnote-1839997 +Node: Advanced Features840083 +Node: Nondecimal Data842073 +Node: Array Sorting843664 +Node: Controlling Array Traversal844364 +Ref: Controlling Array Traversal-Footnote-1852732 +Node: Array Sorting Functions852850 +Ref: Array Sorting Functions-Footnote-1857941 +Node: Two-way I/O858137 +Ref: Two-way I/O-Footnote-1865858 +Ref: Two-way I/O-Footnote-2866045 +Node: TCP/IP Networking866127 +Node: Profiling869245 +Node: Advanced Features Summary878559 +Node: Internationalization880403 +Node: I18N and L10N881883 +Node: Explaining gettext882570 +Ref: Explaining gettext-Footnote-1888462 +Ref: Explaining gettext-Footnote-2888647 +Node: Programmer i18n888812 +Ref: Programmer i18n-Footnote-1893761 +Node: Translator i18n893810 +Node: String Extraction894604 +Ref: String Extraction-Footnote-1895736 +Node: Printf Ordering895822 +Ref: Printf Ordering-Footnote-1898608 +Node: I18N Portability898672 +Ref: I18N Portability-Footnote-1901128 +Node: I18N Example901191 +Ref: I18N Example-Footnote-1904466 +Ref: I18N Example-Footnote-2904539 +Node: Gawk I18N904648 +Node: I18N Summary905297 +Node: Debugger906638 
+Node: Debugging907638 +Node: Debugging Concepts908079 +Node: Debugging Terms909888 +Node: Awk Debugging912463 +Ref: Awk Debugging-Footnote-1913408 +Node: Sample Debugging Session913540 +Node: Debugger Invocation914074 +Node: Finding The Bug915460 +Node: List of Debugger Commands921934 +Node: Breakpoint Control923267 +Node: Debugger Execution Control926961 +Node: Viewing And Changing Data930323 +Node: Execution Stack933864 +Node: Debugger Info935501 +Node: Miscellaneous Debugger Commands939572 +Node: Readline Support944634 +Node: Limitations945530 +Node: Debugging Summary948084 +Node: Namespaces949363 +Node: Global Namespace950474 +Node: Qualified Names951872 +Node: Default Namespace952871 +Node: Changing The Namespace953612 +Node: Naming Rules955226 +Node: Internal Name Management957074 +Node: Namespace Example958116 +Node: Namespace And Features960678 +Node: Namespace Summary962113 +Node: Arbitrary Precision Arithmetic963590 +Node: Computer Arithmetic965077 +Ref: table-numeric-ranges968843 +Ref: table-floating-point-ranges969336 +Ref: Computer Arithmetic-Footnote-1969994 +Node: Math Definitions970051 +Ref: table-ieee-formats973367 +Ref: Math Definitions-Footnote-1973970 +Node: MPFR features974075 +Node: FP Math Caution975793 +Ref: FP Math Caution-Footnote-1976865 +Node: Inexactness of computations977234 +Node: Inexact representation978194 +Node: Comparing FP Values979554 +Node: Errors accumulate980795 +Node: Getting Accuracy982228 +Node: Try To Round984938 +Node: Setting precision985837 +Ref: table-predefined-precision-strings986534 +Node: Setting the rounding mode988364 +Ref: table-gawk-rounding-modes988738 +Ref: Setting the rounding mode-Footnote-1992669 +Node: Arbitrary Precision Integers992848 +Ref: Arbitrary Precision Integers-Footnote-1996023 +Node: Checking for MPFR996172 +Node: POSIX Floating Point Problems997646 +Ref: POSIX Floating Point Problems-Footnote-11001931 +Node: Floating point summary1001969 +Node: Dynamic Extensions1004159 +Node: Extension 
Intro1005712 +Node: Plugin License1006978 +Node: Extension Mechanism Outline1007775 +Ref: figure-load-extension1008214 +Ref: figure-register-new-function1009779 +Ref: figure-call-new-function1010871 +Node: Extension API Description1012933 +Node: Extension API Functions Introduction1014646 +Ref: table-api-std-headers1016482 +Node: General Data Types1020731 +Ref: General Data Types-Footnote-11029361 +Node: Memory Allocation Functions1029660 +Ref: Memory Allocation Functions-Footnote-11034161 +Node: Constructor Functions1034260 +Node: API Ownership of MPFR and GMP Values1037726 +Node: Registration Functions1039039 +Node: Extension Functions1039739 +Node: Exit Callback Functions1045061 +Node: Extension Version String1046311 +Node: Input Parsers1046974 +Node: Output Wrappers1059695 +Node: Two-way processors1064207 +Node: Printing Messages1066472 +Ref: Printing Messages-Footnote-11067643 +Node: Updating ERRNO1067796 +Node: Requesting Values1068535 +Ref: table-value-types-returned1069272 +Node: Accessing Parameters1070208 +Node: Symbol Table Access1071445 +Node: Symbol table by name1071957 +Ref: Symbol table by name-Footnote-11074981 +Node: Symbol table by cookie1075109 +Ref: Symbol table by cookie-Footnote-11079294 +Node: Cached values1079358 +Ref: Cached values-Footnote-11082894 +Node: Array Manipulation1083047 +Ref: Array Manipulation-Footnote-11084138 +Node: Array Data Types1084175 +Ref: Array Data Types-Footnote-11086833 +Node: Array Functions1086925 +Node: Flattening Arrays1091423 +Node: Creating Arrays1098399 +Node: Redirection API1103166 +Node: Extension API Variables1105999 +Node: Extension Versioning1106710 +Ref: gawk-api-version1107139 +Node: Extension GMP/MPFR Versioning1108870 +Node: Extension API Informational Variables1110498 +Node: Extension API Boilerplate1111571 +Node: Changes from API V11115545 +Node: Finding Extensions1117117 +Node: Extension Example1117676 +Node: Internal File Description1118474 +Node: Internal File Ops1122554 +Ref: Internal File 
Ops-Footnote-11133904 +Node: Using Internal File Ops1134044 +Ref: Using Internal File Ops-Footnote-11136427 +Node: Extension Samples1136701 +Node: Extension Sample File Functions1138230 +Node: Extension Sample Fnmatch1145879 +Node: Extension Sample Fork1147366 +Node: Extension Sample Inplace1148584 +Node: Extension Sample Ord1152209 +Node: Extension Sample Readdir1153045 +Ref: table-readdir-file-types1153934 +Node: Extension Sample Revout1155001 +Node: Extension Sample Rev2way1155590 +Node: Extension Sample Read write array1156330 +Node: Extension Sample Readfile1158272 +Node: Extension Sample Time1159367 +Node: Extension Sample API Tests1161119 +Node: gawkextlib1161611 +Node: Extension summary1164529 +Node: Extension Exercises1168231 +Node: Language History1169473 +Node: V7/SVR3.11171129 +Node: SVR41173281 +Node: POSIX1174715 +Node: BTL1176096 +Node: POSIX/GNU1176825 +Node: Feature History1182603 +Node: Common Extensions1198922 +Node: Ranges and Locales1200205 +Ref: Ranges and Locales-Footnote-11204821 +Ref: Ranges and Locales-Footnote-21204848 +Ref: Ranges and Locales-Footnote-31205083 +Node: Contributors1205306 +Node: History summary1211303 +Node: Installation1212683 +Node: Gawk Distribution1213627 +Node: Getting1214111 +Node: Extracting1215074 +Node: Distribution contents1216712 +Node: Unix Installation1223192 +Node: Quick Installation1223874 +Node: Shell Startup Files1226288 +Node: Additional Configuration Options1227377 +Node: Configuration Philosophy1229692 +Node: Non-Unix Installation1232061 +Node: PC Installation1232521 +Node: PC Binary Installation1233359 +Node: PC Compiling1233794 +Node: PC Using1234911 +Node: Cygwin1238464 +Node: MSYS1239688 +Node: VMS Installation1240290 +Node: VMS Compilation1241081 +Ref: VMS Compilation-Footnote-11242310 +Node: VMS Dynamic Extensions1242368 +Node: VMS Installation Details1244053 +Node: VMS Running1246306 +Node: VMS GNV1250585 +Node: VMS Old Gawk1251320 +Node: Bugs1251791 +Node: Bug address1252454 +Node: Usenet1255436 
+Node: Maintainers1256440 +Node: Other Versions1257625 +Node: Installation summary1264713 +Node: Notes1265922 +Node: Compatibility Mode1266716 +Node: Additions1267498 +Node: Accessing The Source1268423 +Node: Adding Code1269860 +Node: New Ports1276079 +Node: Derived Files1280454 +Ref: Derived Files-Footnote-11286114 +Ref: Derived Files-Footnote-21286149 +Ref: Derived Files-Footnote-31286747 +Node: Future Extensions1286861 +Node: Implementation Limitations1287519 +Node: Extension Design1288729 +Node: Old Extension Problems1289873 +Ref: Old Extension Problems-Footnote-11291391 +Node: Extension New Mechanism Goals1291448 +Ref: Extension New Mechanism Goals-Footnote-11294812 +Node: Extension Other Design Decisions1295001 +Node: Extension Future Growth1297114 +Node: Notes summary1297720 +Node: Basic Concepts1298878 +Node: Basic High Level1299559 +Ref: figure-general-flow1299841 +Ref: figure-process-flow1300526 +Ref: Basic High Level-Footnote-11303827 +Node: Basic Data Typing1304012 +Node: Glossary1307340 +Node: Copying1339225 +Node: GNU Free Documentation License1376768 +Node: Index1401888 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index 90146b9f..4f3f67d5 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -26013,45 +26013,64 @@ so that the rest of the code will work as expected: @node Split Program @subsection Splitting a Large File into Pieces -@c FIXME: One day, update to current POSIX version of split - @cindex files @subentry splitting @cindex @code{split} utility -The @command{split} program splits large text files into smaller pieces. -Usage is as follows:@footnote{This is the traditional usage. The -POSIX usage is different, but not relevant for what the program -aims to demonstrate.} +The @command{split} utility splits large text files into smaller pieces. 
+The usage follows the POSIX standard for @command{split} and is as follows:

 @display
-@command{split} [@code{-@var{count}}] [@var{file}] [@var{prefix}]
+@command{split} [@option{-l} @var{count}] [@option{-a} @var{suffix-len}] [@var{file} [@var{outname}]]
+@command{split} @option{-b} @var{N}[@code{k}|@code{m}] [@option{-a} @var{suffix-len}] [@var{file} [@var{outname}]]
 @end display

-By default,
-the output files are named @file{xaa}, @file{xab}, and so on. Each file has
-1,000 lines in it, with the likely exception of the last file. To change the
-number of lines in each file, supply a number on the command line
-preceded with a minus sign (e.g., @samp{-500} for files with 500 lines in them
-instead of 1,000). To change the names of the output files to something like
-@file{myfileaa}, @file{myfileab}, and so on, supply an additional
-argument that specifies the @value{FN} prefix.
-
-Here is a version of @command{split} in @command{awk}. It uses the
-@code{ord()} and @code{chr()} functions presented in
-@ref{Ordinal Functions}.
-
-The program first sets its defaults, and then tests to make sure there are
-not too many arguments. It then looks at each argument in turn. The
-first argument could be a minus sign followed by a number. If it is, this happens
-to look like a negative number, so it is made positive, and that is the
-count of lines. The @value{DF} name is skipped over and the final argument
-is used as the prefix for the output @value{FN}s:
+By default, the output files are named @file{xaa}, @file{xab}, and so
+on. Each file has 1,000 lines in it, with the likely exception of the
+last file.
+
+The @command{split} program has evolved over time, and the current POSIX
+version is more complicated than the original Unix version. The options
+and what they do are as follows:
+
+@table @asis
+@item @option{-a} @var{suffix-len}
+Use @var{suffix-len} characters for the suffix.
For example, if @var{suffix-len}
+is four, the output files would range from @file{xaaaa} to @file{xzzzz}.
+
+@item @option{-b} @var{N}[@code{k}|@code{m}]
+Instead of each file containing a specified number of lines, each file
+should have (at most) @var{N} bytes. Supplying a trailing @samp{k}
+multiplies @var{N} by 1,024, yielding kilobytes. Supplying a trailing
+@samp{m} multiplies @var{N} by 1,048,576 (@math{1,024 @value{TIMES} 1,024})
+yielding megabytes. (This option is mutually exclusive with @option{-l}).
+
+@item @option{-l} @var{count}
+Each file should have at most @var{count} lines, instead of the default
+1,000. (This option is mutually exclusive with @option{-b}).
+@end table
+
+If supplied, @var{file} is the input file to read. Otherwise standard
+input is processed. If supplied, @var{outname} is the leading prefix
+to use for @value{FN}s, instead of @samp{x}.
+
+In order to use the @option{-b} option, @command{gawk} should be invoked
+with its @option{-b} option (@pxref{Options}), or with the environment
+variable @env{LC_ALL} set to @samp{C}, so that each input byte is treated
+as a separate character.@footnote{Using @option{-b} twice requires
+separating @command{gawk}'s options from those of the program. For example:
+@samp{gawk -f getopt.awk -f split.awk -b -- -b 42m large-file.txt split-}.}
+
+Here is an implementation of @command{split} in @command{awk}. It uses the
+@code{getopt()} function presented in @ref{Getopt Function}.
+
+The program begins with a standard descriptive comment and then
+a @code{usage()} function describing the options:

 @cindex @code{split.awk} program
 @example
 @c file eg/prog/split.awk
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
@c endfile @ignore @c file eg/prog/split.awk @@ -26059,100 +26078,277 @@ is used as the prefix for the output @value{FN}s: # Arnold Robbins, arnold@@skeeve.com, Public Domain # May 1993 # Revised slightly, May 2014 +# Rewritten September 2020 @c endfile @end ignore @c file eg/prog/split.awk -# usage: split [-count] [file] [outname] +function usage() +@{ + print("usage: split [-l count] [-a suffix-len] [file [outname]]") > "/dev/stderr" + print(" split [-b N[k|m]] [-a suffix-len] [file [outname]]") > "/dev/stderr" + exit 1 +@} +@c endfile +@end example + +Next, in a @code{BEGIN} rule we set the default values and parse the arguments. +After that we initialize the data structures used to cycle the suffix +from @samp{aa@dots{}} to @samp{zz@dots{}}. Finally we set the name of +the first output file: +@example +@c file eg/prog/split.awk BEGIN @{ - outfile = "x" # default - count = 1000 - if (ARGC > 4) - usage() + # Set defaults: + Suffix_length = 2 + Line_count = 1000 + Byte_count = 0 + Outfile = "x" - i = 1 - if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) @{ - count = -ARGV[i] - ARGV[i] = "" - i++ + parse_arguments() + + init_suffix_data() + + Output = (Outfile compute_suffix()) +@} +@c endfile +@end example + +Parsing the arguments is straightforward. 
The program follows our +convention (@pxref{Library Names}) of having important global variables +start with an uppercase letter: + +@example +@c file eg/prog/split.awk +function parse_arguments( i, c, l, modifier) +@{ + while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) @{ + if (c == "a") + Suffix_length = Optarg + 0 + else if (c == "b") @{ + Byte_count = Optarg + 0 + Line_count = 0 + + l = length(Optarg) + modifier = substr(Optarg, l, 1) + if (modifier == "k") + Byte_count *= 1024 + else if (modifier == "m") + Byte_count *= 1024 * 1024 + @} else if (c == "l") @{ + Line_count = Optarg + 0 + Byte_count = 0 + @} else + usage() @} - # test argv in case reading from stdin instead of file - if (i in ARGV) - i++ # skip datafile name -@group - if (i in ARGV) @{ - outfile = ARGV[i] + + # Clear out options + for (i = 1; i < Optind; i++) ARGV[i] = "" + + # Check for filename + if (ARGV[Optind]) @{ + Optind++ + + # Check for different prefix + if (ARGV[Optind]) @{ + Outfile = ARGV[Optind] + ARGV[Optind] = "" + + if (++Optind < ARGC) + usage() + @} @} -@end group -@group - s1 = s2 = "a" - out = (outfile s1 s2) @} -@end group @c endfile @end example -The next rule does most of the work. @code{tcount} (temporary count) tracks -how many lines have been printed to the output file so far. If it is greater -than @code{count}, it is time to close the current file and start a new one. -@code{s1} and @code{s2} track the current suffixes for the @value{FN}. If -they are both @samp{z}, the file is just too big. Otherwise, @code{s1} -moves to the next letter in the alphabet and @code{s2} starts over again at -@samp{a}: +Managing the @value{FN} suffix is interesting. +Given a suffix of length three, say, the values go from +@samp{aaa}, @samp{aab}, @samp{aac} and so on, all the way to +@samp{zzx}, @samp{zzy}, and finally @samp{zzz}. 
+There are two important aspects to this: + +@itemize @bullet +@item +We have to be +able to easily generate these suffixes, and in particular +easily handle ``rolling over''; for example, going from +@samp{abz} to @samp{aca}. + +@item +We have to tell when we've finished with the last file, +so that if we still have more input data we can print an +error message and exit. The trick is to handle this @emph{after} +using the last suffix, and not when the final suffix is created. +@end itemize + +The computation is handled by @code{compute_suffix()}. +This function is called every time a new file is opened. + +The flow here is messy, because we want to generate @samp{zzzz} (say), +and use it, and only produce an error after all the @value{FN} +suffixes have been used up. The logical steps are as follows: + +@enumerate 1 +@item +Generate the suffix, saving the value in @code{result} to return. +To do this, the supplementary array @code{Suffix_ind} contains one +element for each letter in the suffix. Each element ranges from 1 to +26, acting as the index into a string containing all the lowercase +letters of the English alphabet. +It is initialized by @code{init_suffix_data()}. +@code{result} is built up one letter at a time, using each @code{substr()}. + +@item +Prepare the data structures for the next time @code{compute_suffix()} +is called. To do this, we loop over @code{Suffix_ind}, @emph{backwards}. +If the current element is less than 26, it's incremented and the loop +breaks (@samp{abq} goes to @samp{abr}). Otherwise, the element is +reset to one and we move down the list (@samp{abz} to @samp{aca}). +Thus, the @code{Suffix_ind} array is always ``one step ahead'' of the actual +@value{FN} suffix to be returned. + +@item +Check if we've gone past the limit of possible filenames. +If @code{Reached_last} is true, print a message and exit. Otherwise, +check if @code{Suffix_ind} describes a suffix where all the letters are +@samp{z}. 
If that's the case we're about to return the final suffix. If +so, we set @code{Reached_last} to true so that the @emph{next} call to +@code{compute_suffix()} will cause a failure. +@end enumerate + +Physically, the steps in the function occur in the order 3, 1, 2: -@c else on separate line here for page breaking @example @c file eg/prog/split.awk +function compute_suffix( i, result, letters) @{ - if (++tcount > count) @{ - close(out) - if (s2 == "z") @{ - if (s1 == "z") @{ - printf("split: %s is too large to split\n", - FILENAME) > "/dev/stderr" - exit 1 - @} - s1 = chr(ord(s1) + 1) - s2 = "a" - @} -@group - else - s2 = chr(ord(s2) + 1) -@end group - out = (outfile s1 s2) - tcount = 1 + # Logical step 3 + if (Reached_last) @{ + printf("split: too many files!\n") > "/dev/stderr" + exit 1 + @} else if (on_last_file()) + Reached_last = 1 # fail when wrapping after 'zzz' + + # Logical step 1 + result = "" + letters = "abcdefghijklmnopqrstuvwxyz" + for (i = 1; i <= Suffix_length; i++) + result = result substr(letters, Suffix_ind[i], 1) + + # Logical step 2 + for (i = Suffix_length; i >= 1; i--) @{ + if (++Suffix_ind[i] > 26) @{ + Suffix_ind[i] = 1 + @} else + break @} - print > out + + return result @} @c endfile @end example -@noindent -The @code{usage()} function simply prints an error message and exits: +The @code{Suffix_ind} array and @code{Reached_last} are initialized +by @code{init_suffix_data()}: @example @c file eg/prog/split.awk -function usage() +function init_suffix_data( i) @{ - print("usage: split [-num] [file] [outname]") > "/dev/stderr" - exit 1 + for (i = 1; i <= Suffix_length; i++) + Suffix_ind[i] = 1 + + Reached_last = 0 @} @c endfile @end example -This program is a bit sloppy; it relies on @command{awk} to automatically close the last file -instead of doing it in an @code{END} rule. -It also assumes that letters are contiguous in the character set, -which isn't true for EBCDIC systems. 
+The function @code{on_last_file()} returns true if @code{Suffix_ind} describes +a suffix where all the letters are @samp{z} by checking that all the elements +in the array are equal to 26: -@ifset FOR_PRINT -You might want to consider how to eliminate the use of -@code{ord()} and @code{chr()}; this can be done in such a -way as to solve the EBCDIC issue as well. -@end ifset +@example +@c file eg/prog/split.awk +function on_last_file( i, on_last) +@{ + on_last = 1 + for (i = 1; i <= Suffix_length; i++) @{ + on_last = on_last && (Suffix_ind[i] == 26) + @} + + return on_last +@} +@c endfile +@end example + +The actual work of splitting the input file is done by the next two rules. +Since splitting by line count and splitting by byte count are mutually +exclusive, we simply use two separate rules, one for when @code{Line_count} +is greater than zero, and another for when @code{Byte_count} is greater than zero. + +The variable @code{tcount} counts how many lines have been processed so far. +When it exceeds @code{Line_count}, it's time to close the previous file and +switch to a new one: + +@example +@c file eg/prog/split.awk +Line_count > 0 @{ + if (++tcount > Line_count) @{ + close(Output) + Output = (Outfile compute_suffix()) + tcount = 1 + @} + print > Output +@} +@c endfile +@end example + +The rule for handling bytes is more complicated. Since lines most likely +vary in length, the @code{Byte_count} boundary may be hit in the middle of +an input record. In that case, @command{split} has to write enough of the +first bytes of the input record to finish up @code{Byte_count} bytes, close +the file, open a new file, and write the rest of the record to the new file. 
+The logic here does all that: + +@example +@c file eg/prog/split.awk +Byte_count > 0 @{ + # `+ 1' is for the final newline + if (tcount + length($0) + 1 > Byte_count) @{ # would overflow + # compute leading bytes + leading_bytes = Byte_count - tcount + + # write leading bytes + printf("%s", substr($0, 1, leading_bytes)) > Output + + # close old file, open new file + close(Output) + Output = (Outfile compute_suffix()) + + # set up first bytes for new file + $0 = substr($0, leading_bytes + 1) # trailing bytes + tcount = 0 + @} + + # write full record or trailing bytes + tcount += length($0) + 1 + print > Output +@} +@c endfile +@end example +Finally, the @code{END} rule cleans up by closing the last output file: + +@example +@c file eg/prog/split.awk +END @{ + close(Output) +@} +@c endfile +@end example @node Tee Program @subsection Duplicating Output into Multiple Files diff --git a/doc/gawktexi.in b/doc/gawktexi.in index ae1d0bc4..f77d071d 100644 --- a/doc/gawktexi.in +++ b/doc/gawktexi.in @@ -25023,45 +25023,64 @@ so that the rest of the code will work as expected: @node Split Program @subsection Splitting a Large File into Pieces -@c FIXME: One day, update to current POSIX version of split - @cindex files @subentry splitting @cindex @code{split} utility -The @command{split} program splits large text files into smaller pieces. -Usage is as follows:@footnote{This is the traditional usage. The -POSIX usage is different, but not relevant for what the program -aims to demonstrate.} +The @command{split} utility splits large text files into smaller pieces. 
+The usage follows the POSIX standard for @command{split} and is as follows:

 @display
-@command{split} [@code{-@var{count}}] [@var{file}] [@var{prefix}]
+@command{split} [@option{-l} @var{count}] [@option{-a} @var{suffix-len}] [@var{file} [@var{outname}]]
+@command{split} @option{-b} @var{N}[@code{k}|@code{m}] [@option{-a} @var{suffix-len}] [@var{file} [@var{outname}]]
 @end display

-By default,
-the output files are named @file{xaa}, @file{xab}, and so on. Each file has
-1,000 lines in it, with the likely exception of the last file. To change the
-number of lines in each file, supply a number on the command line
-preceded with a minus sign (e.g., @samp{-500} for files with 500 lines in them
-instead of 1,000). To change the names of the output files to something like
-@file{myfileaa}, @file{myfileab}, and so on, supply an additional
-argument that specifies the @value{FN} prefix.
-
-Here is a version of @command{split} in @command{awk}. It uses the
-@code{ord()} and @code{chr()} functions presented in
-@ref{Ordinal Functions}.
-
-The program first sets its defaults, and then tests to make sure there are
-not too many arguments. It then looks at each argument in turn. The
-first argument could be a minus sign followed by a number. If it is, this happens
-to look like a negative number, so it is made positive, and that is the
-count of lines. The @value{DF} name is skipped over and the final argument
-is used as the prefix for the output @value{FN}s:
+By default, the output files are named @file{xaa}, @file{xab}, and so
+on. Each file has 1,000 lines in it, with the likely exception of the
+last file.
+
+The @command{split} program has evolved over time, and the current POSIX
+version is more complicated than the original Unix version. The options
+and what they do are as follows:
+
+@table @asis
+@item @option{-a} @var{suffix-len}
+Use @var{suffix-len} characters for the suffix.
For example, if @var{suffix-len}
+is four, the output files would range from @file{xaaaa} to @file{xzzzz}.
+
+@item @option{-b} @var{N}[@code{k}|@code{m}]
+Instead of each file containing a specified number of lines, each file
+should have (at most) @var{N} bytes. Supplying a trailing @samp{k}
+multiplies @var{N} by 1,024, yielding kilobytes. Supplying a trailing
+@samp{m} multiplies @var{N} by 1,048,576 (@math{1,024 @value{TIMES} 1,024})
+yielding megabytes. (This option is mutually exclusive with @option{-l}).
+
+@item @option{-l} @var{count}
+Each file should have at most @var{count} lines, instead of the default
+1,000. (This option is mutually exclusive with @option{-b}).
+@end table
+
+If supplied, @var{file} is the input file to read. Otherwise standard
+input is processed. If supplied, @var{outname} is the leading prefix
+to use for @value{FN}s, instead of @samp{x}.
+
+In order to use the @option{-b} option, @command{gawk} should be invoked
+with its @option{-b} option (@pxref{Options}), or with the environment
+variable @env{LC_ALL} set to @samp{C}, so that each input byte is treated
+as a separate character.@footnote{Using @option{-b} twice requires
+separating @command{gawk}'s options from those of the program. For example:
+@samp{gawk -f getopt.awk -f split.awk -b -- -b 42m large-file.txt split-}.}
+
+Here is an implementation of @command{split} in @command{awk}. It uses the
+@code{getopt()} function presented in @ref{Getopt Function}.
+
+The program begins with a standard descriptive comment and then
+a @code{usage()} function describing the options:

 @cindex @code{split.awk} program
 @example
 @c file eg/prog/split.awk
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
@c endfile @ignore @c file eg/prog/split.awk @@ -25069,100 +25088,277 @@ is used as the prefix for the output @value{FN}s: # Arnold Robbins, arnold@@skeeve.com, Public Domain # May 1993 # Revised slightly, May 2014 +# Rewritten September 2020 @c endfile @end ignore @c file eg/prog/split.awk -# usage: split [-count] [file] [outname] +function usage() +@{ + print("usage: split [-l count] [-a suffix-len] [file [outname]]") > "/dev/stderr" + print(" split [-b N[k|m]] [-a suffix-len] [file [outname]]") > "/dev/stderr" + exit 1 +@} +@c endfile +@end example + +Next, in a @code{BEGIN} rule we set the default values and parse the arguments. +After that we initialize the data structures used to cycle the suffix +from @samp{aa@dots{}} to @samp{zz@dots{}}. Finally we set the name of +the first output file: +@example +@c file eg/prog/split.awk BEGIN @{ - outfile = "x" # default - count = 1000 - if (ARGC > 4) - usage() + # Set defaults: + Suffix_length = 2 + Line_count = 1000 + Byte_count = 0 + Outfile = "x" - i = 1 - if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) @{ - count = -ARGV[i] - ARGV[i] = "" - i++ + parse_arguments() + + init_suffix_data() + + Output = (Outfile compute_suffix()) +@} +@c endfile +@end example + +Parsing the arguments is straightforward. 
The program follows our +convention (@pxref{Library Names}) of having important global variables +start with an uppercase letter: + +@example +@c file eg/prog/split.awk +function parse_arguments( i, c, l, modifier) +@{ + while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) @{ + if (c == "a") + Suffix_length = Optarg + 0 + else if (c == "b") @{ + Byte_count = Optarg + 0 + Line_count = 0 + + l = length(Optarg) + modifier = substr(Optarg, l, 1) + if (modifier == "k") + Byte_count *= 1024 + else if (modifier == "m") + Byte_count *= 1024 * 1024 + @} else if (c == "l") @{ + Line_count = Optarg + 0 + Byte_count = 0 + @} else + usage() @} - # test argv in case reading from stdin instead of file - if (i in ARGV) - i++ # skip datafile name -@group - if (i in ARGV) @{ - outfile = ARGV[i] + + # Clear out options + for (i = 1; i < Optind; i++) ARGV[i] = "" + + # Check for filename + if (ARGV[Optind]) @{ + Optind++ + + # Check for different prefix + if (ARGV[Optind]) @{ + Outfile = ARGV[Optind] + ARGV[Optind] = "" + + if (++Optind < ARGC) + usage() + @} @} -@end group -@group - s1 = s2 = "a" - out = (outfile s1 s2) @} -@end group @c endfile @end example -The next rule does most of the work. @code{tcount} (temporary count) tracks -how many lines have been printed to the output file so far. If it is greater -than @code{count}, it is time to close the current file and start a new one. -@code{s1} and @code{s2} track the current suffixes for the @value{FN}. If -they are both @samp{z}, the file is just too big. Otherwise, @code{s1} -moves to the next letter in the alphabet and @code{s2} starts over again at -@samp{a}: +Managing the @value{FN} suffix is interesting. +Given a suffix of length three, say, the values go from +@samp{aaa}, @samp{aab}, @samp{aac} and so on, all the way to +@samp{zzx}, @samp{zzy}, and finally @samp{zzz}. 
+There are two important aspects to this: + +@itemize @bullet +@item +We have to be +able to easily generate these suffixes, and in particular +easily handle ``rolling over''; for example, going from +@samp{abz} to @samp{aca}. + +@item +We have to tell when we've finished with the last file, +so that if we still have more input data we can print an +error message and exit. The trick is to handle this @emph{after} +using the last suffix, and not when the final suffix is created. +@end itemize + +The computation is handled by @code{compute_suffix()}. +This function is called every time a new file is opened. + +The flow here is messy, because we want to generate @samp{zzzz} (say), +and use it, and only produce an error after all the @value{FN} +suffixes have been used up. The logical steps are as follows: + +@enumerate 1 +@item +Generate the suffix, saving the value in @code{result} to return. +To do this, the supplementary array @code{Suffix_ind} contains one +element for each letter in the suffix. Each element ranges from 1 to +26, acting as the index into a string containing all the lowercase +letters of the English alphabet. +It is initialized by @code{init_suffix_data()}. +@code{result} is built up one letter at a time, using each @code{substr()}. + +@item +Prepare the data structures for the next time @code{compute_suffix()} +is called. To do this, we loop over @code{Suffix_ind}, @emph{backwards}. +If the current element is less than 26, it's incremented and the loop +breaks (@samp{abq} goes to @samp{abr}). Otherwise, the element is +reset to one and we move down the list (@samp{abz} to @samp{aca}). +Thus, the @code{Suffix_ind} array is always ``one step ahead'' of the actual +@value{FN} suffix to be returned. + +@item +Check if we've gone past the limit of possible filenames. +If @code{Reached_last} is true, print a message and exit. Otherwise, +check if @code{Suffix_ind} describes a suffix where all the letters are +@samp{z}. 
If that's the case we're about to return the final suffix. If +so, we set @code{Reached_last} to true so that the @emph{next} call to +@code{compute_suffix()} will cause a failure. +@end enumerate + +Physically, the steps in the function occur in the order 3, 1, 2: -@c else on separate line here for page breaking @example @c file eg/prog/split.awk +function compute_suffix( i, result, letters) @{ - if (++tcount > count) @{ - close(out) - if (s2 == "z") @{ - if (s1 == "z") @{ - printf("split: %s is too large to split\n", - FILENAME) > "/dev/stderr" - exit 1 - @} - s1 = chr(ord(s1) + 1) - s2 = "a" - @} -@group - else - s2 = chr(ord(s2) + 1) -@end group - out = (outfile s1 s2) - tcount = 1 + # Logical step 3 + if (Reached_last) @{ + printf("split: too many files!\n") > "/dev/stderr" + exit 1 + @} else if (on_last_file()) + Reached_last = 1 # fail when wrapping after 'zzz' + + # Logical step 1 + result = "" + letters = "abcdefghijklmnopqrstuvwxyz" + for (i = 1; i <= Suffix_length; i++) + result = result substr(letters, Suffix_ind[i], 1) + + # Logical step 2 + for (i = Suffix_length; i >= 1; i--) @{ + if (++Suffix_ind[i] > 26) @{ + Suffix_ind[i] = 1 + @} else + break @} - print > out + + return result @} @c endfile @end example -@noindent -The @code{usage()} function simply prints an error message and exits: +The @code{Suffix_ind} array and @code{Reached_last} are initialized +by @code{init_suffix_data()}: @example @c file eg/prog/split.awk -function usage() +function init_suffix_data( i) @{ - print("usage: split [-num] [file] [outname]") > "/dev/stderr" - exit 1 + for (i = 1; i <= Suffix_length; i++) + Suffix_ind[i] = 1 + + Reached_last = 0 @} @c endfile @end example -This program is a bit sloppy; it relies on @command{awk} to automatically close the last file -instead of doing it in an @code{END} rule. -It also assumes that letters are contiguous in the character set, -which isn't true for EBCDIC systems. 
+The function @code{on_last_file()} returns true if @code{Suffix_ind} describes +a suffix where all the letters are @samp{z} by checking that all the elements +in the array are equal to 26: -@ifset FOR_PRINT -You might want to consider how to eliminate the use of -@code{ord()} and @code{chr()}; this can be done in such a -way as to solve the EBCDIC issue as well. -@end ifset +@example +@c file eg/prog/split.awk +function on_last_file( i, on_last) +@{ + on_last = 1 + for (i = 1; i <= Suffix_length; i++) @{ + on_last = on_last && (Suffix_ind[i] == 26) + @} + + return on_last +@} +@c endfile +@end example + +The actual work of splitting the input file is done by the next two rules. +Since splitting by line count and splitting by byte count are mutually +exclusive, we simply use two separate rules, one for when @code{Line_count} +is greater than zero, and another for when @code{Byte_count} is greater than zero. + +The variable @code{tcount} counts how many lines have been processed so far. +When it exceeds @code{Line_count}, it's time to close the previous file and +switch to a new one: + +@example +@c file eg/prog/split.awk +Line_count > 0 @{ + if (++tcount > Line_count) @{ + close(Output) + Output = (Outfile compute_suffix()) + tcount = 1 + @} + print > Output +@} +@c endfile +@end example + +The rule for handling bytes is more complicated. Since lines most likely +vary in length, the @code{Byte_count} boundary may be hit in the middle of +an input record. In that case, @command{split} has to write enough of the +first bytes of the input record to finish up @code{Byte_count} bytes, close +the file, open a new file, and write the rest of the record to the new file. 
+The logic here does all that: + +@example +@c file eg/prog/split.awk +Byte_count > 0 @{ + # `+ 1' is for the final newline + if (tcount + length($0) + 1 > Byte_count) @{ # would overflow + # compute leading bytes + leading_bytes = Byte_count - tcount + + # write leading bytes + printf("%s", substr($0, 1, leading_bytes)) > Output + + # close old file, open new file + close(Output) + Output = (Outfile compute_suffix()) + + # set up first bytes for new file + $0 = substr($0, leading_bytes + 1) # trailing bytes + tcount = 0 + @} + + # write full record or trailing bytes + tcount += length($0) + 1 + print > Output +@} +@c endfile +@end example +Finally, the @code{END} rule cleans up by closing the last output file: + +@example +@c file eg/prog/split.awk +END @{ + close(Output) +@} +@c endfile +@end example @node Tee Program @subsection Duplicating Output into Multiple Files diff --git a/doc/wordlist b/doc/wordlist index d3aea0be..40f55da6 100644 --- a/doc/wordlist +++ b/doc/wordlist @@ -358,6 +358,7 @@ Oram Ord Ormos Ou +Outfile Oy PASSWD PATCHLEVEL @@ -517,11 +518,13 @@ aB aCa aIt aSAgbfocFVtguG +aa aaa aaaabcd aab aabbb aabbbccccddd +aac aaccdd ab abCDEf @@ -534,9 +537,13 @@ abcdefg abcdefghi abcdefghijklmnopqrstuvwxyz abcdxyz +abq +abr abs abx +abz ac +aca acbfoo aclocal addrs @@ -1768,6 +1775,7 @@ xDeadBeef xFOO xX xaa +xaaaa xab xbd xdeadBEEF @@ -1796,6 +1804,7 @@ xxx xyz xyzzy xz +xzzzz ya yabber yballs @@ -1809,5 +1818,9 @@ zbcom zerofile zodiacusque zsh +zz +zzx +zzy zzz +zzzz zzzzzz |