diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2017-04-12 12:37:00 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2017-04-12 12:37:00 +0300 |
commit | c8d82e842e7ad94cae039e6c978b4bea1a31c4d5 (patch) | |
tree | b5da46c4c3e0f96683a42c6db5a32ea96aa854e3 | |
parent | 8f83ab76a1d8861d9a992290f2691443d5169c89 (diff) | |
download | egawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.tar.gz egawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.tar.bz2 egawk-c8d82e842e7ad94cae039e6c978b4bea1a31c4d5.zip |
Fix FPAT problems.
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | awk.h | 1 | ||||
-rw-r--r-- | doc/ChangeLog | 4 | ||||
-rw-r--r-- | doc/gawk.1 | 8 | ||||
-rw-r--r-- | doc/gawk.info | 742 | ||||
-rw-r--r-- | doc/gawk.texi | 10 | ||||
-rw-r--r-- | doc/gawktexi.in | 10 | ||||
-rw-r--r-- | field.c | 249 | ||||
-rw-r--r-- | test/ChangeLog | 7 | ||||
-rw-r--r-- | test/Makefile.am | 7 | ||||
-rw-r--r-- | test/Makefile.in | 12 | ||||
-rw-r--r-- | test/Maketests | 5 | ||||
-rw-r--r-- | test/fpat6.awk | 8 | ||||
-rw-r--r-- | test/fpat6.in | 13 | ||||
-rw-r--r-- | test/fpat6.ok | 44 | ||||
-rw-r--r-- | test/patsplit.ok | 3 |
16 files changed, 581 insertions, 550 deletions
@@ -1,3 +1,11 @@ +2017-04-12 Manuel Collado <m-collado@users.sourceforge.net> + + Fix the FPAT bug reported by Ed Morton in the gawk-bug mailing list. + + * awk.h (Regexp): Remove the non_empty flag. + * field.c (fpat_parse_field): Restructure the code to reduce complexity + and document the new structure. + 2017-04-10 Andrew J. Schorr <aschorr@telemetry-investments.com> * awk.h (enum opcodeval): For the avoidance of doubt, specify that @@ -210,7 +210,6 @@ typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; struct dfa *dfareg; - bool non_empty; /* for use in fpat_parse_field */ bool has_meta; /* re has meta chars so (probably) isn't simple string */ bool maybe_long; /* re has meta chars that can match long text */ } Regexp; diff --git a/doc/ChangeLog b/doc/ChangeLog index 82ae2ce3..61d09e28 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,7 @@ +2017-04-12 Manuel Collado <m-collado@users.sourceforge.net> + + * gawktexi.in, gawk.1: Small clarification of the patsplit behavior. + 2017-04-11 Arnold D. Robbins <arnold@skeeve.com> * gawktexi.in: Minor style edits. @@ -2977,9 +2977,11 @@ that matched .IR r . The value of .BI seps[ i ] -is the separator that appeared in -front of -.BI a[ i +1]\fR. +is the possibly null separator that appeared after +.BI a[ i ]\fR. +The value of +.B seps[0] +is the possibly null leading separator. \&\fRIf .I r is omitted, diff --git a/doc/gawk.info b/doc/gawk.info index 42c3c197..d3ae5639 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -12670,16 +12670,18 @@ Options::): fatal error. 'patsplit(STRING, ARRAY' [', FIELDPAT' [', SEPS' ] ]') #' - Divide STRING into pieces defined by FIELDPAT and store the pieces - in ARRAY and the separator strings in the SEPS array. The first - piece is stored in 'ARRAY[1]', the second piece in 'ARRAY[2]', and - so forth. The third argument, FIELDPAT, is a regexp describing the - fields in STRING (just as 'FPAT' is a regexp describing the fields - in input records). 
It may be either a regexp constant or a string. - If FIELDPAT is omitted, the value of 'FPAT' is used. 'patsplit()' - returns the number of elements created. 'SEPS[I]' is the separator - string between 'ARRAY[I]' and 'ARRAY[I+1]'. Any leading separator - will be in 'SEPS[0]'. + Divide STRING into pieces (or "fields") defined by FIELDPAT and + store the pieces in ARRAY and the separator strings in the SEPS + array. The first piece is stored in 'ARRAY[1]', the second piece + in 'ARRAY[2]', and so forth. The third argument, FIELDPAT, is a + regexp describing the fields in STRING (just as 'FPAT' is a regexp + describing the fields in input records). It may be either a regexp + constant or a string. If FIELDPAT is omitted, the value of 'FPAT' + is used. 'patsplit()' returns the number of elements created. + 'SEPS[I]' is the possibly null separator string after 'ARRAY[I]'. + The possibly null leading separator will be in 'SEPS[0]'. So a + non-null STRING with N fields will have N+1 separators. A null + STRING will have neither fields nor separators. The 'patsplit()' function splits strings into pieces in a manner similar to the way input lines are split into fields using 'FPAT' @@ -32477,7 +32479,7 @@ Index * * (asterisk), * operator, as regexp operator: Regexp Operators. (line 89) * * (asterisk), * operator, null strings, matching: String Functions. - (line 537) + (line 539) * * (asterisk), ** operator: Arithmetic Ops. (line 81) * * (asterisk), ** operator <1>: Precedence. (line 48) * * (asterisk), **= operator: Assignment Ops. 
(line 129) @@ -33094,7 +33096,7 @@ Index * Brian Kernighan's awk <8>: Continue Statement. (line 44) * Brian Kernighan's awk <9>: Nextfile Statement. (line 47) * Brian Kernighan's awk <10>: Delete. (line 51) -* Brian Kernighan's awk <11>: String Functions. (line 493) +* Brian Kernighan's awk <11>: String Functions. (line 495) * Brian Kernighan's awk <12>: Gory Details. (line 19) * Brian Kernighan's awk <13>: I/O Functions. (line 43) * Brian Kernighan's awk, extensions: BTL. (line 6) @@ -33137,7 +33139,7 @@ Index * case sensitivity, and regexps: User-modified. (line 79) * case sensitivity, and string comparisons: User-modified. (line 79) * case sensitivity, array indices and: Array Intro. (line 100) -* case sensitivity, converting case: String Functions. (line 523) +* case sensitivity, converting case: String Functions. (line 525) * case sensitivity, example programs: Library Functions. (line 53) * case sensitivity, gawk: Case-sensitivity. (line 26) * case sensitivity, regexps and: Case-sensitivity. (line 6) @@ -33265,9 +33267,9 @@ Index * control statements: Statements. (line 6) * controlling array scanning order: Controlling Scanning. (line 14) -* convert string to lower case: String Functions. (line 524) -* convert string to number: String Functions. (line 391) -* convert string to upper case: String Functions. (line 530) +* convert string to lower case: String Functions. (line 526) +* convert string to number: String Functions. (line 393) +* convert string to upper case: String Functions. (line 532) * converting integer array subscripts: Numeric Array Subscripts. (line 31) * converting, dates to timestamps: Time Functions. (line 78) @@ -33346,7 +33348,7 @@ Index (line 149) * dark corner, regexp constants, as arguments to user-defined functions: Standard Regexp Constants. (line 43) -* dark corner, split() function: String Functions. (line 361) +* dark corner, split() function: String Functions. (line 363) * dark corner, strings, storing: gawk split records. 
(line 82) * dark corner, value of ARGV[0]: Auto-set. (line 39) * dark corner, ^, in FS: Regexp Field Splitting. @@ -33563,7 +33565,7 @@ Index * differences in awk and gawk, single-character fields: Single Character Fields. (line 6) * differences in awk and gawk, split() function: String Functions. - (line 348) + (line 350) * differences in awk and gawk, strings: Scalar Constants. (line 20) * differences in awk and gawk, strings, storing: gawk split records. (line 76) @@ -33896,7 +33898,7 @@ Index * format time string: Time Functions. (line 50) * formats, numeric output: OFMT. (line 6) * formatting output: Printf. (line 6) -* formatting strings: String Functions. (line 384) +* formatting strings: String Functions. (line 386) * forward slash (/) to enclose regular expressions: Regexp. (line 10) * forward slash (/), / operator: Precedence. (line 54) * forward slash (/), /= operator: Assignment Ops. (line 129) @@ -34169,7 +34171,7 @@ Index * gsub: Standard Regexp Constants. (line 43) * gsub <1>: String Functions. (line 139) -* gsub() function, arguments of: String Functions. (line 463) +* gsub() function, arguments of: String Functions. (line 465) * gsub() function, escape processing: Gory Details. (line 6) * h debugger command (alias for help): Miscellaneous Debugger Commands. (line 69) @@ -34458,7 +34460,7 @@ Index * matching, expressions, See comparison expressions: Typing and Comparison. (line 9) * matching, leftmost longest: Multiple Line. (line 26) -* matching, null strings: String Functions. (line 537) +* matching, null strings: String Functions. (line 539) * mawk utility: Escape Sequences. (line 121) * mawk utility <1>: Getline/Pipe. (line 62) * mawk utility <2>: Concatenation. (line 36) @@ -34547,7 +34549,7 @@ Index (line 43) * null strings, converting numbers to strings: Strings And Numbers. (line 21) -* null strings, matching: String Functions. (line 537) +* null strings, matching: String Functions. (line 539) * number as string of bits: Bitwise Functions. 
(line 108) * number of array elements: String Functions. (line 200) * number sign (#), #! (executable scripts): Executable Scripts. @@ -34725,7 +34727,7 @@ Index * portability, operators: Increment Ops. (line 60) * portability, operators, not in POSIX awk: Precedence. (line 97) * portability, POSIXLY_CORRECT environment variable: Options. (line 363) -* portability, substr() function: String Functions. (line 513) +* portability, substr() function: String Functions. (line 515) * portable object files: Explaining gettext. (line 37) * portable object files <1>: Translator i18n. (line 6) * portable object files, converting to message object files: I18N Example. @@ -34976,7 +34978,7 @@ Index * regular expressions, searching for: Egrep Program. (line 6) * relational operators, See comparison operators: Typing and Comparison. (line 9) -* replace in string: String Functions. (line 409) +* replace in string: String Functions. (line 411) * retrying input: Retrying Input. (line 6) * return debugger command: Debugger Execution Control. (line 54) @@ -35144,7 +35146,7 @@ Index (line 37) * sidebar, Interactive Versus Noninteractive Buffering: I/O Functions. (line 74) -* sidebar, Matching the Null String: String Functions. (line 535) +* sidebar, Matching the Null String: String Functions. (line 537) * sidebar, Operator Evaluation Order: Increment Ops. (line 58) * sidebar, Piping into sh: Redirection. (line 134) * sidebar, Pre-POSIX awk Used OFMT for String Conversion: Strings And Numbers. @@ -35213,13 +35215,13 @@ Index * source files, search path for: Programs Exercises. (line 70) * sparse arrays: Array Intro. (line 76) * Spencer, Henry: Glossary. (line 16) -* split: String Functions. (line 315) +* split: String Functions. (line 317) * split string into array: String Functions. (line 296) * split utility: Split Program. (line 6) * split() function, array elements, deleting: Delete. (line 61) * split.awk program: Split Program. (line 30) * sprintf: OFMT. 
(line 15) -* sprintf <1>: String Functions. (line 384) +* sprintf <1>: String Functions. (line 386) * sprintf() function, OFMT variable and: User-modified. (line 116) * sprintf() function, print/printf statements and: Round Function. (line 6) @@ -35261,10 +35263,10 @@ Index * string-manipulation functions: String Functions. (line 6) * string-matching operators: Regexp Usage. (line 19) * string-translation functions: I18N Functions. (line 6) -* strings splitting, example: String Functions. (line 334) +* strings splitting, example: String Functions. (line 336) * strings, converting: Strings And Numbers. (line 6) * strings, converting <1>: Bitwise Functions. (line 108) -* strings, converting letter case: String Functions. (line 523) +* strings, converting letter case: String Functions. (line 525) * strings, converting, numbers to: User-modified. (line 30) * strings, converting, numbers to <1>: User-modified. (line 107) * strings, empty, See null strings: awk split records. (line 114) @@ -35275,13 +35277,13 @@ Index * strings, null: Regexp Field Splitting. (line 43) * strings, numeric: Variable Typing. (line 67) -* strtonum: String Functions. (line 391) +* strtonum: String Functions. (line 393) * strtonum() function (gawk), --non-decimal-data option and: Nondecimal Data. (line 35) * sub: Standard Regexp Constants. (line 43) -* sub <1>: String Functions. (line 409) -* sub() function, arguments of: String Functions. (line 463) +* sub <1>: String Functions. (line 411) +* sub() function, arguments of: String Functions. (line 465) * sub() function, escape processing: Gory Details. (line 6) * subscript separators: User-modified. (line 149) * subscripts in arrays, multidimensional: Multidimensional. (line 10) @@ -35295,8 +35297,8 @@ Index * SUBSEP variable, and multidimensional arrays: Multidimensional. (line 16) * substitute in string: String Functions. (line 89) -* substr: String Functions. (line 482) -* substring: String Functions. (line 482) +* substr: String Functions. 
(line 484) +* substring: String Functions. (line 484) * Sumner, Andrew: Other Versions. (line 68) * supplementary groups of gawk process: Auto-set. (line 252) * switch statement: Switch Statement. (line 6) @@ -35356,8 +35358,8 @@ Index * timestamps, converting dates to: Time Functions. (line 78) * timestamps, formatted: Getlocaltime Function. (line 6) -* tolower: String Functions. (line 524) -* toupper: String Functions. (line 530) +* tolower: String Functions. (line 526) +* toupper: String Functions. (line 532) * tr utility: Translate Program. (line 6) * trace debugger command: Miscellaneous Debugger Commands. (line 110) @@ -35383,7 +35385,7 @@ Index * troubleshooting, gawk, fatal errors, function arguments: Calling Built-in. (line 16) * troubleshooting, getline function: File Checking. (line 25) -* troubleshooting, gsub()/sub() functions: String Functions. (line 473) +* troubleshooting, gsub()/sub() functions: String Functions. (line 475) * troubleshooting, match() function: String Functions. (line 291) * troubleshooting, print statement, omitting commas: Print Examples. (line 30) @@ -35393,7 +35395,7 @@ Index * troubleshooting, regexp constants vs. string constants: Computed Regexps. (line 40) * troubleshooting, string concatenation: Concatenation. (line 27) -* troubleshooting, substr() function: String Functions. (line 500) +* troubleshooting, substr() function: String Functions. (line 502) * troubleshooting, system() function: I/O Functions. (line 129) * troubleshooting, typographical errors, global variables: Options. 
(line 99) @@ -35790,336 +35792,336 @@ Ref: Numeric Functions-Footnote-1525480 Ref: Numeric Functions-Footnote-2525837 Ref: Numeric Functions-Footnote-3525885 Node: String Functions526157 -Ref: String Functions-Footnote-1549661 -Ref: String Functions-Footnote-2549789 -Ref: String Functions-Footnote-3550037 -Node: Gory Details550124 -Ref: table-sub-escapes551915 -Ref: table-sub-proposed553434 -Ref: table-posix-sub554797 -Ref: table-gensub-escapes556338 -Ref: Gory Details-Footnote-1557161 -Node: I/O Functions557315 -Ref: table-system-return-values563897 -Ref: I/O Functions-Footnote-1565877 -Ref: I/O Functions-Footnote-2566025 -Node: Time Functions566145 -Ref: Time Functions-Footnote-1576812 -Ref: Time Functions-Footnote-2576880 -Ref: Time Functions-Footnote-3577038 -Ref: Time Functions-Footnote-4577149 -Ref: Time Functions-Footnote-5577261 -Ref: Time Functions-Footnote-6577488 -Node: Bitwise Functions577754 -Ref: table-bitwise-ops578348 -Ref: Bitwise Functions-Footnote-1584381 -Ref: Bitwise Functions-Footnote-2584554 -Node: Type Functions584745 -Node: I18N Functions587420 -Node: User-defined589071 -Node: Definition Syntax589876 -Ref: Definition Syntax-Footnote-1595563 -Node: Function Example595634 -Ref: Function Example-Footnote-1598556 -Node: Function Caveats598578 -Node: Calling A Function599096 -Node: Variable Scope600054 -Node: Pass By Value/Reference603048 -Node: Return Statement606547 -Node: Dynamic Typing609526 -Node: Indirect Calls610456 -Ref: Indirect Calls-Footnote-1620707 -Node: Functions Summary620835 -Node: Library Functions623540 -Ref: Library Functions-Footnote-1627147 -Ref: Library Functions-Footnote-2627290 -Node: Library Names627461 -Ref: Library Names-Footnote-1630921 -Ref: Library Names-Footnote-2631144 -Node: General Functions631230 -Node: Strtonum Function632333 -Node: Assert Function635355 -Node: Round Function638681 -Node: Cliff Random Function640222 -Node: Ordinal Functions641238 -Ref: Ordinal Functions-Footnote-1644301 -Ref: Ordinal 
Functions-Footnote-2644553 -Node: Join Function644763 -Ref: Join Function-Footnote-1646533 -Node: Getlocaltime Function646733 -Node: Readfile Function650475 -Node: Shell Quoting652447 -Node: Data File Management653848 -Node: Filetrans Function654480 -Node: Rewind Function658576 -Node: File Checking660482 -Ref: File Checking-Footnote-1661816 -Node: Empty Files662017 -Node: Ignoring Assigns663996 -Node: Getopt Function665546 -Ref: Getopt Function-Footnote-1677015 -Node: Passwd Functions677215 -Ref: Passwd Functions-Footnote-1686054 -Node: Group Functions686142 -Ref: Group Functions-Footnote-1694040 -Node: Walking Arrays694247 -Node: Library Functions Summary697255 -Node: Library Exercises698661 -Node: Sample Programs699126 -Node: Running Examples699896 -Node: Clones700624 -Node: Cut Program701848 -Node: Egrep Program711777 -Ref: Egrep Program-Footnote-1719289 -Node: Id Program719399 -Node: Split Program723079 -Ref: Split Program-Footnote-1726538 -Node: Tee Program726667 -Node: Uniq Program729457 -Node: Wc Program736883 -Ref: Wc Program-Footnote-1741138 -Node: Miscellaneous Programs741232 -Node: Dupword Program742445 -Node: Alarm Program744475 -Node: Translate Program749330 -Ref: Translate Program-Footnote-1753895 -Node: Labels Program754165 -Ref: Labels Program-Footnote-1757516 -Node: Word Sorting757600 -Node: History Sorting761672 -Node: Extract Program763507 -Node: Simple Sed771036 -Node: Igawk Program774110 -Ref: Igawk Program-Footnote-1788441 -Ref: Igawk Program-Footnote-2788643 -Ref: Igawk Program-Footnote-3788765 -Node: Anagram Program788880 -Node: Signature Program791942 -Node: Programs Summary793189 -Node: Programs Exercises794403 -Ref: Programs Exercises-Footnote-1798532 -Node: Advanced Features798623 -Node: Nondecimal Data800613 -Node: Array Sorting802204 -Node: Controlling Array Traversal802904 -Ref: Controlling Array Traversal-Footnote-1811271 -Node: Array Sorting Functions811389 -Ref: Array Sorting Functions-Footnote-1816480 -Node: Two-way I/O816676 
-Ref: Two-way I/O-Footnote-1823227 -Ref: Two-way I/O-Footnote-2823414 -Node: TCP/IP Networking823496 -Node: Profiling826614 -Ref: Profiling-Footnote-1835286 -Node: Advanced Features Summary835609 -Node: Internationalization837453 -Node: I18N and L10N838933 -Node: Explaining gettext839620 -Ref: Explaining gettext-Footnote-1845512 -Ref: Explaining gettext-Footnote-2845697 -Node: Programmer i18n845862 -Ref: Programmer i18n-Footnote-1850811 -Node: Translator i18n850860 -Node: String Extraction851654 -Ref: String Extraction-Footnote-1852786 -Node: Printf Ordering852872 -Ref: Printf Ordering-Footnote-1855658 -Node: I18N Portability855722 -Ref: I18N Portability-Footnote-1858178 -Node: I18N Example858241 -Ref: I18N Example-Footnote-1861047 -Node: Gawk I18N861120 -Node: I18N Summary861765 -Node: Debugger863106 -Node: Debugging864128 -Node: Debugging Concepts864569 -Node: Debugging Terms866378 -Node: Awk Debugging868953 -Node: Sample Debugging Session869859 -Node: Debugger Invocation870393 -Node: Finding The Bug871779 -Node: List of Debugger Commands878257 -Node: Breakpoint Control879590 -Node: Debugger Execution Control883284 -Node: Viewing And Changing Data886646 -Node: Execution Stack890020 -Node: Debugger Info891657 -Node: Miscellaneous Debugger Commands895728 -Node: Readline Support900816 -Node: Limitations901712 -Node: Debugging Summary903821 -Node: Arbitrary Precision Arithmetic905100 -Node: Computer Arithmetic906516 -Ref: table-numeric-ranges910107 -Ref: Computer Arithmetic-Footnote-1910829 -Node: Math Definitions910886 -Ref: table-ieee-formats914200 -Ref: Math Definitions-Footnote-1914803 -Node: MPFR features914908 -Node: FP Math Caution916625 -Ref: FP Math Caution-Footnote-1917697 -Node: Inexactness of computations918066 -Node: Inexact representation919026 -Node: Comparing FP Values920386 -Node: Errors accumulate921468 -Node: Getting Accuracy922901 -Node: Try To Round925611 -Node: Setting precision926510 -Ref: table-predefined-precision-strings927207 -Node: Setting 
the rounding mode929037 -Ref: table-gawk-rounding-modes929411 -Ref: Setting the rounding mode-Footnote-1932819 -Node: Arbitrary Precision Integers932998 -Ref: Arbitrary Precision Integers-Footnote-1937915 -Node: POSIX Floating Point Problems938064 -Ref: POSIX Floating Point Problems-Footnote-1941946 -Node: Floating point summary941984 -Node: Dynamic Extensions944174 -Node: Extension Intro945727 -Node: Plugin License946993 -Node: Extension Mechanism Outline947790 -Ref: figure-load-extension948229 -Ref: figure-register-new-function949794 -Ref: figure-call-new-function950886 -Node: Extension API Description952948 -Node: Extension API Functions Introduction954590 -Node: General Data Types959924 -Ref: General Data Types-Footnote-1967129 -Node: Memory Allocation Functions967428 -Ref: Memory Allocation Functions-Footnote-1970273 -Node: Constructor Functions970372 -Node: Registration Functions973371 -Node: Extension Functions974056 -Node: Exit Callback Functions979269 -Node: Extension Version String980519 -Node: Input Parsers981182 -Node: Output Wrappers993889 -Node: Two-way processors998401 -Node: Printing Messages1000666 -Ref: Printing Messages-Footnote-11001837 -Node: Updating ERRNO1001990 -Node: Requesting Values1002729 -Ref: table-value-types-returned1003466 -Node: Accessing Parameters1004402 -Node: Symbol Table Access1005637 -Node: Symbol table by name1006149 -Node: Symbol table by cookie1007938 -Ref: Symbol table by cookie-Footnote-11012123 -Node: Cached values1012187 -Ref: Cached values-Footnote-11015723 -Node: Array Manipulation1015814 -Ref: Array Manipulation-Footnote-11016905 -Node: Array Data Types1016942 -Ref: Array Data Types-Footnote-11019600 -Node: Array Functions1019692 -Node: Flattening Arrays1024091 -Node: Creating Arrays1031032 -Node: Redirection API1035801 -Node: Extension API Variables1038643 -Node: Extension Versioning1039276 -Ref: gawk-api-version1039713 -Node: Extension API Informational Variables1041441 -Node: Extension API Boilerplate1042505 
-Node: Changes from API V11046367 -Node: Finding Extensions1047027 -Node: Extension Example1047586 -Node: Internal File Description1048384 -Node: Internal File Ops1052464 -Ref: Internal File Ops-Footnote-11063864 -Node: Using Internal File Ops1064004 -Ref: Using Internal File Ops-Footnote-11066387 -Node: Extension Samples1066661 -Node: Extension Sample File Functions1068190 -Node: Extension Sample Fnmatch1075839 -Node: Extension Sample Fork1077326 -Node: Extension Sample Inplace1078544 -Node: Extension Sample Ord1081754 -Node: Extension Sample Readdir1082590 -Ref: table-readdir-file-types1083479 -Node: Extension Sample Revout1084284 -Node: Extension Sample Rev2way1084873 -Node: Extension Sample Read write array1085613 -Node: Extension Sample Readfile1087555 -Node: Extension Sample Time1088650 -Node: Extension Sample API Tests1089998 -Node: gawkextlib1090490 -Node: Extension summary1092937 -Node: Extension Exercises1096639 -Node: Language History1098137 -Node: V7/SVR3.11099793 -Node: SVR41101945 -Node: POSIX1103379 -Node: BTL1104758 -Node: POSIX/GNU1105487 -Node: Feature History1111379 -Node: Common Extensions1125749 -Node: Ranges and Locales1127032 -Ref: Ranges and Locales-Footnote-11131648 -Ref: Ranges and Locales-Footnote-21131675 -Ref: Ranges and Locales-Footnote-31131910 -Node: Contributors1132131 -Node: History summary1137691 -Node: Installation1139071 -Node: Gawk Distribution1140015 -Node: Getting1140499 -Node: Extracting1141460 -Node: Distribution contents1143098 -Node: Unix Installation1149440 -Node: Quick Installation1150122 -Node: Shell Startup Files1152536 -Node: Additional Configuration Options1153625 -Node: Configuration Philosophy1155430 -Node: Non-Unix Installation1157799 -Node: PC Installation1158259 -Node: PC Binary Installation1159097 -Node: PC Compiling1159532 -Node: PC Using1160649 -Node: Cygwin1163694 -Node: MSYS1164464 -Node: VMS Installation1164965 -Node: VMS Compilation1165756 -Ref: VMS Compilation-Footnote-11166985 -Node: VMS Dynamic 
Extensions1167043 -Node: VMS Installation Details1168728 -Node: VMS Running1170981 -Node: VMS GNV1175260 -Node: VMS Old Gawk1175995 -Node: Bugs1176466 -Node: Bug address1177129 -Node: Usenet1179526 -Node: Maintainers1180303 -Node: Other Versions1181679 -Node: Installation summary1188263 -Node: Notes1189298 -Node: Compatibility Mode1190163 -Node: Additions1190945 -Node: Accessing The Source1191870 -Node: Adding Code1193305 -Node: New Ports1199523 -Node: Derived Files1204011 -Ref: Derived Files-Footnote-11209496 -Ref: Derived Files-Footnote-21209531 -Ref: Derived Files-Footnote-31210129 -Node: Future Extensions1210243 -Node: Implementation Limitations1210901 -Node: Extension Design1212084 -Node: Old Extension Problems1213238 -Ref: Old Extension Problems-Footnote-11214756 -Node: Extension New Mechanism Goals1214813 -Ref: Extension New Mechanism Goals-Footnote-11218177 -Node: Extension Other Design Decisions1218366 -Node: Extension Future Growth1220479 -Node: Old Extension Mechanism1221315 -Node: Notes summary1223078 -Node: Basic Concepts1224260 -Node: Basic High Level1224941 -Ref: figure-general-flow1225223 -Ref: figure-process-flow1225908 -Ref: Basic High Level-Footnote-11229209 -Node: Basic Data Typing1229394 -Node: Glossary1232722 -Node: Copying1264669 -Node: GNU Free Documentation License1302208 -Node: Index1327326 +Ref: String Functions-Footnote-1549815 +Ref: String Functions-Footnote-2549943 +Ref: String Functions-Footnote-3550191 +Node: Gory Details550278 +Ref: table-sub-escapes552069 +Ref: table-sub-proposed553588 +Ref: table-posix-sub554951 +Ref: table-gensub-escapes556492 +Ref: Gory Details-Footnote-1557315 +Node: I/O Functions557469 +Ref: table-system-return-values564051 +Ref: I/O Functions-Footnote-1566031 +Ref: I/O Functions-Footnote-2566179 +Node: Time Functions566299 +Ref: Time Functions-Footnote-1576966 +Ref: Time Functions-Footnote-2577034 +Ref: Time Functions-Footnote-3577192 +Ref: Time Functions-Footnote-4577303 +Ref: Time Functions-Footnote-5577415 
+Ref: Time Functions-Footnote-6577642 +Node: Bitwise Functions577908 +Ref: table-bitwise-ops578502 +Ref: Bitwise Functions-Footnote-1584535 +Ref: Bitwise Functions-Footnote-2584708 +Node: Type Functions584899 +Node: I18N Functions587574 +Node: User-defined589225 +Node: Definition Syntax590030 +Ref: Definition Syntax-Footnote-1595717 +Node: Function Example595788 +Ref: Function Example-Footnote-1598710 +Node: Function Caveats598732 +Node: Calling A Function599250 +Node: Variable Scope600208 +Node: Pass By Value/Reference603202 +Node: Return Statement606701 +Node: Dynamic Typing609680 +Node: Indirect Calls610610 +Ref: Indirect Calls-Footnote-1620861 +Node: Functions Summary620989 +Node: Library Functions623694 +Ref: Library Functions-Footnote-1627301 +Ref: Library Functions-Footnote-2627444 +Node: Library Names627615 +Ref: Library Names-Footnote-1631075 +Ref: Library Names-Footnote-2631298 +Node: General Functions631384 +Node: Strtonum Function632487 +Node: Assert Function635509 +Node: Round Function638835 +Node: Cliff Random Function640376 +Node: Ordinal Functions641392 +Ref: Ordinal Functions-Footnote-1644455 +Ref: Ordinal Functions-Footnote-2644707 +Node: Join Function644917 +Ref: Join Function-Footnote-1646687 +Node: Getlocaltime Function646887 +Node: Readfile Function650629 +Node: Shell Quoting652601 +Node: Data File Management654002 +Node: Filetrans Function654634 +Node: Rewind Function658730 +Node: File Checking660636 +Ref: File Checking-Footnote-1661970 +Node: Empty Files662171 +Node: Ignoring Assigns664150 +Node: Getopt Function665700 +Ref: Getopt Function-Footnote-1677169 +Node: Passwd Functions677369 +Ref: Passwd Functions-Footnote-1686208 +Node: Group Functions686296 +Ref: Group Functions-Footnote-1694194 +Node: Walking Arrays694401 +Node: Library Functions Summary697409 +Node: Library Exercises698815 +Node: Sample Programs699280 +Node: Running Examples700050 +Node: Clones700778 +Node: Cut Program702002 +Node: Egrep Program711931 +Ref: Egrep 
Program-Footnote-1719443 +Node: Id Program719553 +Node: Split Program723233 +Ref: Split Program-Footnote-1726692 +Node: Tee Program726821 +Node: Uniq Program729611 +Node: Wc Program737037 +Ref: Wc Program-Footnote-1741292 +Node: Miscellaneous Programs741386 +Node: Dupword Program742599 +Node: Alarm Program744629 +Node: Translate Program749484 +Ref: Translate Program-Footnote-1754049 +Node: Labels Program754319 +Ref: Labels Program-Footnote-1757670 +Node: Word Sorting757754 +Node: History Sorting761826 +Node: Extract Program763661 +Node: Simple Sed771190 +Node: Igawk Program774264 +Ref: Igawk Program-Footnote-1788595 +Ref: Igawk Program-Footnote-2788797 +Ref: Igawk Program-Footnote-3788919 +Node: Anagram Program789034 +Node: Signature Program792096 +Node: Programs Summary793343 +Node: Programs Exercises794557 +Ref: Programs Exercises-Footnote-1798686 +Node: Advanced Features798777 +Node: Nondecimal Data800767 +Node: Array Sorting802358 +Node: Controlling Array Traversal803058 +Ref: Controlling Array Traversal-Footnote-1811425 +Node: Array Sorting Functions811543 +Ref: Array Sorting Functions-Footnote-1816634 +Node: Two-way I/O816830 +Ref: Two-way I/O-Footnote-1823381 +Ref: Two-way I/O-Footnote-2823568 +Node: TCP/IP Networking823650 +Node: Profiling826768 +Ref: Profiling-Footnote-1835440 +Node: Advanced Features Summary835763 +Node: Internationalization837607 +Node: I18N and L10N839087 +Node: Explaining gettext839774 +Ref: Explaining gettext-Footnote-1845666 +Ref: Explaining gettext-Footnote-2845851 +Node: Programmer i18n846016 +Ref: Programmer i18n-Footnote-1850965 +Node: Translator i18n851014 +Node: String Extraction851808 +Ref: String Extraction-Footnote-1852940 +Node: Printf Ordering853026 +Ref: Printf Ordering-Footnote-1855812 +Node: I18N Portability855876 +Ref: I18N Portability-Footnote-1858332 +Node: I18N Example858395 +Ref: I18N Example-Footnote-1861201 +Node: Gawk I18N861274 +Node: I18N Summary861919 +Node: Debugger863260 +Node: Debugging864282 +Node: 
Debugging Concepts864723 +Node: Debugging Terms866532 +Node: Awk Debugging869107 +Node: Sample Debugging Session870013 +Node: Debugger Invocation870547 +Node: Finding The Bug871933 +Node: List of Debugger Commands878411 +Node: Breakpoint Control879744 +Node: Debugger Execution Control883438 +Node: Viewing And Changing Data886800 +Node: Execution Stack890174 +Node: Debugger Info891811 +Node: Miscellaneous Debugger Commands895882 +Node: Readline Support900970 +Node: Limitations901866 +Node: Debugging Summary903975 +Node: Arbitrary Precision Arithmetic905254 +Node: Computer Arithmetic906670 +Ref: table-numeric-ranges910261 +Ref: Computer Arithmetic-Footnote-1910983 +Node: Math Definitions911040 +Ref: table-ieee-formats914354 +Ref: Math Definitions-Footnote-1914957 +Node: MPFR features915062 +Node: FP Math Caution916779 +Ref: FP Math Caution-Footnote-1917851 +Node: Inexactness of computations918220 +Node: Inexact representation919180 +Node: Comparing FP Values920540 +Node: Errors accumulate921622 +Node: Getting Accuracy923055 +Node: Try To Round925765 +Node: Setting precision926664 +Ref: table-predefined-precision-strings927361 +Node: Setting the rounding mode929191 +Ref: table-gawk-rounding-modes929565 +Ref: Setting the rounding mode-Footnote-1932973 +Node: Arbitrary Precision Integers933152 +Ref: Arbitrary Precision Integers-Footnote-1938069 +Node: POSIX Floating Point Problems938218 +Ref: POSIX Floating Point Problems-Footnote-1942100 +Node: Floating point summary942138 +Node: Dynamic Extensions944328 +Node: Extension Intro945881 +Node: Plugin License947147 +Node: Extension Mechanism Outline947944 +Ref: figure-load-extension948383 +Ref: figure-register-new-function949948 +Ref: figure-call-new-function951040 +Node: Extension API Description953102 +Node: Extension API Functions Introduction954744 +Node: General Data Types960078 +Ref: General Data Types-Footnote-1967283 +Node: Memory Allocation Functions967582 +Ref: Memory Allocation Functions-Footnote-1970427 +Node: 
Constructor Functions970526 +Node: Registration Functions973525 +Node: Extension Functions974210 +Node: Exit Callback Functions979423 +Node: Extension Version String980673 +Node: Input Parsers981336 +Node: Output Wrappers994043 +Node: Two-way processors998555 +Node: Printing Messages1000820 +Ref: Printing Messages-Footnote-11001991 +Node: Updating ERRNO1002144 +Node: Requesting Values1002883 +Ref: table-value-types-returned1003620 +Node: Accessing Parameters1004556 +Node: Symbol Table Access1005791 +Node: Symbol table by name1006303 +Node: Symbol table by cookie1008092 +Ref: Symbol table by cookie-Footnote-11012277 +Node: Cached values1012341 +Ref: Cached values-Footnote-11015877 +Node: Array Manipulation1015968 +Ref: Array Manipulation-Footnote-11017059 +Node: Array Data Types1017096 +Ref: Array Data Types-Footnote-11019754 +Node: Array Functions1019846 +Node: Flattening Arrays1024245 +Node: Creating Arrays1031186 +Node: Redirection API1035955 +Node: Extension API Variables1038797 +Node: Extension Versioning1039430 +Ref: gawk-api-version1039867 +Node: Extension API Informational Variables1041595 +Node: Extension API Boilerplate1042659 +Node: Changes from API V11046521 +Node: Finding Extensions1047181 +Node: Extension Example1047740 +Node: Internal File Description1048538 +Node: Internal File Ops1052618 +Ref: Internal File Ops-Footnote-11064018 +Node: Using Internal File Ops1064158 +Ref: Using Internal File Ops-Footnote-11066541 +Node: Extension Samples1066815 +Node: Extension Sample File Functions1068344 +Node: Extension Sample Fnmatch1075993 +Node: Extension Sample Fork1077480 +Node: Extension Sample Inplace1078698 +Node: Extension Sample Ord1081908 +Node: Extension Sample Readdir1082744 +Ref: table-readdir-file-types1083633 +Node: Extension Sample Revout1084438 +Node: Extension Sample Rev2way1085027 +Node: Extension Sample Read write array1085767 +Node: Extension Sample Readfile1087709 +Node: Extension Sample Time1088804 +Node: Extension Sample API Tests1090152 
+Node: gawkextlib1090644 +Node: Extension summary1093091 +Node: Extension Exercises1096793 +Node: Language History1098291 +Node: V7/SVR3.11099947 +Node: SVR41102099 +Node: POSIX1103533 +Node: BTL1104912 +Node: POSIX/GNU1105641 +Node: Feature History1111533 +Node: Common Extensions1125903 +Node: Ranges and Locales1127186 +Ref: Ranges and Locales-Footnote-11131802 +Ref: Ranges and Locales-Footnote-21131829 +Ref: Ranges and Locales-Footnote-31132064 +Node: Contributors1132285 +Node: History summary1137845 +Node: Installation1139225 +Node: Gawk Distribution1140169 +Node: Getting1140653 +Node: Extracting1141614 +Node: Distribution contents1143252 +Node: Unix Installation1149594 +Node: Quick Installation1150276 +Node: Shell Startup Files1152690 +Node: Additional Configuration Options1153779 +Node: Configuration Philosophy1155584 +Node: Non-Unix Installation1157953 +Node: PC Installation1158413 +Node: PC Binary Installation1159251 +Node: PC Compiling1159686 +Node: PC Using1160803 +Node: Cygwin1163848 +Node: MSYS1164618 +Node: VMS Installation1165119 +Node: VMS Compilation1165910 +Ref: VMS Compilation-Footnote-11167139 +Node: VMS Dynamic Extensions1167197 +Node: VMS Installation Details1168882 +Node: VMS Running1171135 +Node: VMS GNV1175414 +Node: VMS Old Gawk1176149 +Node: Bugs1176620 +Node: Bug address1177283 +Node: Usenet1179680 +Node: Maintainers1180457 +Node: Other Versions1181833 +Node: Installation summary1188417 +Node: Notes1189452 +Node: Compatibility Mode1190317 +Node: Additions1191099 +Node: Accessing The Source1192024 +Node: Adding Code1193459 +Node: New Ports1199677 +Node: Derived Files1204165 +Ref: Derived Files-Footnote-11209650 +Ref: Derived Files-Footnote-21209685 +Ref: Derived Files-Footnote-31210283 +Node: Future Extensions1210397 +Node: Implementation Limitations1211055 +Node: Extension Design1212238 +Node: Old Extension Problems1213392 +Ref: Old Extension Problems-Footnote-11214910 +Node: Extension New Mechanism Goals1214967 +Ref: Extension New 
Mechanism Goals-Footnote-11218331 +Node: Extension Other Design Decisions1218520 +Node: Extension Future Growth1220633 +Node: Old Extension Mechanism1221469 +Node: Notes summary1223232 +Node: Basic Concepts1224414 +Node: Basic High Level1225095 +Ref: figure-general-flow1225377 +Ref: figure-process-flow1226062 +Ref: Basic High Level-Footnote-11229363 +Node: Basic Data Typing1229548 +Node: Glossary1232876 +Node: Copying1264823 +Node: GNU Free Documentation License1302362 +Node: Index1327480 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index 0e376104..8b872e9d 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -17994,7 +17994,7 @@ using a third argument is a fatal error. @cindexgawkfunc{patsplit} @cindex split string into array Divide -@var{string} into pieces defined by @var{fieldpat} +@var{string} into pieces (or ``fields'') defined by @var{fieldpat} and store the pieces in @var{array} and the separator strings in the @var{seps} array. The first piece is stored in @code{@var{array}[1]}, the second piece in @code{@var{array}[2]}, and so @@ -18005,9 +18005,11 @@ It may be either a regexp constant or a string. If @var{fieldpat} is omitted, the value of @code{FPAT} is used. @code{patsplit()} returns the number of elements created. @code{@var{seps}[@var{i}]} is -the separator string -between @code{@var{array}[@var{i}]} and @code{@var{array}[@var{i}+1]}. -Any leading separator will be in @code{@var{seps}[0]}. +the possibly null separator string +after @code{@var{array}[@var{i}]}. +The possibly null leading separator will be in @code{@var{seps}[0]}. +So a non-null @var{string} with @var{n} fields will have @var{n}+1 separators. +A null @var{string} has neither fields nor separators.
The @code{patsplit()} function splits strings into pieces in a manner similar to the way input lines are split into fields using @code{FPAT} diff --git a/doc/gawktexi.in b/doc/gawktexi.in index f4fe2596..f991432c 100644 --- a/doc/gawktexi.in +++ b/doc/gawktexi.in @@ -17267,7 +17267,7 @@ using a third argument is a fatal error. @cindexgawkfunc{patsplit} @cindex split string into array Divide -@var{string} into pieces defined by @var{fieldpat} +@var{string} into pieces (or ``fields'') defined by @var{fieldpat} and store the pieces in @var{array} and the separator strings in the @var{seps} array. The first piece is stored in @code{@var{array}[1]}, the second piece in @code{@var{array}[2]}, and so @@ -17278,9 +17278,11 @@ It may be either a regexp constant or a string. If @var{fieldpat} is omitted, the value of @code{FPAT} is used. @code{patsplit()} returns the number of elements created. @code{@var{seps}[@var{i}]} is -the separator string -between @code{@var{array}[@var{i}]} and @code{@var{array}[@var{i}+1]}. -Any leading separator will be in @code{@var{seps}[0]}. +the possibly null separator string +after @code{@var{array}[@var{i}]}. +The possibly null leading separator will be in @code{@var{seps}[0]}. +So a non-null @var{string} with @var{n} fields will have @var{n}+1 separators. +A null @var{string} has neither fields nor separators. The @code{patsplit()} function splits strings into pieces in a manner similar to the way input lines are split into fields using @code{FPAT} @@ -1502,101 +1502,65 @@ incr_scan(char **scanp, size_t len, mbstate_t *mbs) * via (*parse_field)(). This variation is for when FPAT is a regular * expression -- use the value to find field contents. * - * This was really hard to get right. It happens to bear many resemblances - * to issues I had with getting gsub right with null matches. When dealing - * with that I prototyped in awk and had the foresight to save the awk code - * over in the C file.
Starting with that as a base, I finally got to this - * awk code to do what I needed, and then translated it into C. Fortunately - * the C code bears a closer correspondance to the awk code here than over - * by gsub. + * The FPAT parsing logic is a bit difficult to specify, in particular + * when allowing null fields at certain locations. To make the code as robust + * as possible, an awk reference implementation was written and tested + * as a first step, and later recoded in C, preserving its structure as + * much as possible. * - * BEGIN { - * false = 0 - * true = 1 - * - * fpat[1] = "([^,]*)|(\"[^\"]+\")" - * fpat[2] = fpat[1] - * fpat[3] = fpat[1] - * fpat[4] = "aa+" - * fpat[5] = fpat[4] - * - * data[1] = "Robbins,,Arnold," - * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA" - * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA" - * data[4] = "bbbaaacccdddaaaaaqqqq" - * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa - * - * for (i = 1; i in data; i++) { - * printf("Splitting: <%s>\n", data[i]) - * n = mypatsplit(data[i], fields, fpat[i], seps) - * print "n =", n - * for (j = 1; j <= n; j++) - * printf("fields[%d] = <%s>\n", j, fields[j]) - * for (j = 0; j in seps; j++) - * printf("seps[%s] = <%s>\n", j, seps[j]) - * } - * } - * - * function mypatsplit(string, array, pattern, seps, - * eosflag, non_empty, nf) # locals + * # Reference implementation of the FPAT record parsing. + * # + * # Each loop iteration identifies a (separator[n-1],field[n]) pair. + * # Each loop iteration must consume some characters, except for the first field. + * # So a null field is only valid as a first field or after a non-null separator. + * # A null record has no fields (not a single null field).
+ * + * function refpatsplit(string, fields, pattern, seps, + * parse_start, sep_start, field_start, field_length, field_found, nf) # locals * { - * delete array - * delete seps - * if (length(string) == 0) - * return 0 - * - * eosflag = non_empty = false - * nf = 0 - * while (match(string, pattern)) { - * if (RLENGTH > 0) { # easy case - * non_empty = true - * if (! (nf in seps)) { - * if (RSTART == 1) # match at front of string - * seps[nf] = "" - * else - * seps[nf] = substr(string, 1, RSTART - 1) - * } - * array[++nf] = substr(string, RSTART, RLENGTH) - * string = substr(string, RSTART+RLENGTH) - * if (length(string) == 0) - * break - * } else if (non_empty) { - * # last match was non-empty, and at the - * # current character we get a zero length match, - * # which we don't want, so skip over it - * non_empty = false - * seps[nf] = substr(string, 1, 1) - * string = substr(string, 2) - * } else { - * # 0 length match - * if (! (nf in seps)) { - * if (RSTART == 1) - * seps[nf] = "" - * else - * seps[nf] = substr(string, 1, RSTART - 1) - * } - * array[++nf] = "" - * if (! non_empty && ! 
 eosflag) { # prev was empty - * seps[nf] = substr(string, 1, 1) - * } - * if (RSTART == 1) { - * string = substr(string, 2) - * } else { - * string = substr(string, RSTART + 1) - * } - * non_empty = false - * } - * if (length(string) == 0) { - * if (eosflag) - * break - * else - * eosflag = true - * } - * } - * if (length(string) > 0) - * seps[nf] = string - * - * return length(array) + * # Local state variables: + * # - parse_start: pointer to the first not yet consumed character + * # - sep_start: pointer to the beginning of the parsed separator + * # - field_start: pointer to the beginning of the parsed field + * # - field_length: length of the parsed field + * # - field_found: flag for successful field match + * # - nf: Number of fields found so far + * + * # Prepare for parsing + * parse_start = 1 # first not yet parsed char + * nf = 0 # fields found so far + * delete fields + * delete seps + * + * # Loop that consumes the whole record + * while (parse_start <= length(string)) { # still something to parse + * + * # first attempt to match the next field + * sep_start = parse_start + * field_found = match(substr(string, parse_start), pattern) + * + * # check for an invalid null field and retry one character away + * if (nf > 0 && field_found && RSTART==1 && RLENGTH==0) { + * parse_start++ + * field_found = match(substr(string, parse_start), pattern) + * } + * + * # store the (sep[n-1],field[n]) pair + * if (field_found) { + * field_start = parse_start + RSTART - 1 + * field_length = RLENGTH + * seps[nf] = substr(string, sep_start, field_start-sep_start) + * fields[++nf] = substr(string, field_start, field_length) + * parse_start = field_start + field_length + * + * # store the final extra sep after the last field + * } else { + * seps[nf] = substr(string, sep_start) + * parse_start = length(string) + 1 + * } + * } + * + * return nf * } */ static long @@ -1615,10 +1579,9 @@ fpat_parse_field(long up_to, /* parse only up to this field number */ char *start; char
*end = scan + len; int regex_flags = RE_NEED_START; - bool need_to_set_sep; - bool non_empty; - bool eosflag; mbstate_t mbs; + char* field_start; + bool field_found; memset(&mbs, 0, sizeof(mbstate_t)); @@ -1631,90 +1594,48 @@ fpat_parse_field(long up_to, /* parse only up to this field number */ if (rp == NULL) /* use FPAT */ rp = FPAT_regexp; - if (in_middle) { - regex_flags |= RE_NO_BOL; - } - non_empty = rp->non_empty; + while (scan <= end && nf < up_to) { /* still something to parse */ - eosflag = false; - need_to_set_sep = true; - start = scan; - while (research(rp, scan, 0, (end - scan), regex_flags) != -1 - && nf < up_to) { + /* first attempt to match the next field */ + start = scan; + field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1; + + /* check for an invalid null field and retry one character away */ + if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */ + increment_scan(& scan, end - scan); + field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1; + } - if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */ - non_empty = true; - if (sep_arr != NULL && need_to_set_sep) { - if (RESTART(rp, scan) == 0) /* match at front */ - set_element(nf, start, 0L, sep_arr); + /* store the (sep[n-1],field[n]) pair */ + if (field_found) { + field_start = scan + RESTART(rp, scan); + if (sep_arr != NULL) { /* store the separator */ + if (field_start == start) /* match at front */ + set_element(nf, start, 0L, sep_arr); else - set_element(nf, + set_element(nf, start, - (long) RESTART(rp, scan), + (long) (field_start - start), sep_arr); } /* field is text that matched */ (*set)(++nf, - scan + RESTART(rp, scan), + field_start, (long)(REEND(rp, scan) - RESTART(rp, scan)), n); - scan += REEND(rp, scan); - if (scan >= end) - break; - need_to_set_sep = true; - } else if (non_empty) { /* else if non_empty */ + + } else { /* - * last match was non-empty, and at the - * current character we get a zero length match, 
- * which we don't want, so skip over it + * No match, store the final extra separator after + * the last field. */ - non_empty = false; - if (sep_arr != NULL) { - need_to_set_sep = false; - set_element(nf, start, 1L, sep_arr); - } - increment_scan(& scan, end - scan); - } else { - /* 0 length match */ - if (sep_arr != NULL && need_to_set_sep) { - if (RESTART(rp, scan) == 0) /* RSTART == 1 */ - set_element(nf, start, 0L, sep_arr); - else - set_element(nf, start, - (long) RESTART(rp, scan), - sep_arr); - } - need_to_set_sep = true; - (*set)(++nf, scan, 0L, n); - if (! non_empty && ! eosflag) { /* prev was empty */ - if (sep_arr != NULL) { - set_element(nf, start, 1L, sep_arr); - need_to_set_sep = false; - } - } - if (RESTART(rp, scan) == 0) - increment_scan(& scan, end - scan); - else { - scan += RESTART(rp, scan); - } - non_empty = false; - } - if (scan >= end) { /* length(string) == 0 */ - if (eosflag) - break; - else - eosflag = true; + if (sep_arr != NULL) + set_element(nf, start, (long) (end - start), sep_arr); + scan = end + 1; } - - start = scan; - } - if (scan < end) { - if (sep_arr != NULL) - set_element(nf, scan, (long) (end - scan), sep_arr); } *buf = scan; - rp->non_empty = non_empty; return nf; } diff --git a/test/ChangeLog b/test/ChangeLog index dd6b4cfb..ecac0519 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,10 @@ +2017-04-12 Manuel Collado <m-collado@users.sourceforge.net> + + * Makefile.am (fpat6): New test. + * fpat6.awk, fpat6.in, fpat6.ok: New files. + Check for the bug reported by Ed Morton in the bug-gawk mailing list. + * patsplit.ok: Updated to the new patsplit behavior. + 2017-04-12 Arnold D. Robbins <arnold@skeeve.com> * Makefile.am (memleak): New test. 
diff --git a/test/Makefile.am b/test/Makefile.am index fe9b1dcc..7b1b4946 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -339,6 +339,9 @@ EXTRA_DIST = \ fpat5.awk \ fpat5.in \ fpat5.ok \ + fpat6.awk \ + fpat6.in \ + fpat6.ok \ fpatnull.awk \ fpatnull.in \ fpatnull.ok \ @@ -1227,8 +1230,8 @@ GAWK_EXT_TESTS = \ colonwarn clos1way clos1way2 clos1way3 clos1way4 clos1way5 clos1way6 \ crlf dbugeval dbugeval2 dbugtypedre1 dbugtypedre2 delsub \ devfd devfd1 devfd2 dumpvars errno exit \ - fieldwdth forcenum fpat1 fpat2 fpat3 fpat4 fpat5 fpatnull fsfwfs funlen \ - functab1 functab2 functab3 fwtest fwtest2 fwtest3 fwtest4 \ + fieldwdth forcenum fpat1 fpat2 fpat3 fpat4 fpat5 fpat6 fpatnull \ + fsfwfs funlen functab1 functab2 functab3 fwtest fwtest2 fwtest3 fwtest4 \ genpot gensub gensub2 gensub3 getlndir gnuops2 gnuops3 gnureops gsubind \ icasefs icasers id igncdym igncfs ignrcas2 ignrcas4 ignrcase \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ diff --git a/test/Makefile.in b/test/Makefile.in index e6293e8d..bdfbdc82 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -597,6 +597,9 @@ EXTRA_DIST = \ fpat5.awk \ fpat5.in \ fpat5.ok \ + fpat6.awk \ + fpat6.in \ + fpat6.ok \ fpatnull.awk \ fpatnull.in \ fpatnull.ok \ @@ -1484,8 +1487,8 @@ GAWK_EXT_TESTS = \ colonwarn clos1way clos1way2 clos1way3 clos1way4 clos1way5 clos1way6 \ crlf dbugeval dbugeval2 dbugtypedre1 dbugtypedre2 delsub \ devfd devfd1 devfd2 dumpvars errno exit \ - fieldwdth forcenum fpat1 fpat2 fpat3 fpat4 fpat5 fpatnull fsfwfs funlen \ - functab1 functab2 functab3 fwtest fwtest2 fwtest3 fwtest4 \ + fieldwdth forcenum fpat1 fpat2 fpat3 fpat4 fpat5 fpat6 fpatnull \ + fsfwfs funlen functab1 functab2 functab3 fwtest fwtest2 fwtest3 fwtest4 \ genpot gensub gensub2 gensub3 getlndir gnuops2 gnuops3 gnureops gsubind \ icasefs icasers id igncdym igncfs ignrcas2 ignrcas4 ignrcase \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ @@ -3966,6 +3969,11 @@ fpat5: 
@AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +fpat6: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fpatnull: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ diff --git a/test/Maketests b/test/Maketests index 4b765c9f..0c77f98a 100644 --- a/test/Maketests +++ b/test/Maketests @@ -1142,6 +1142,11 @@ fpat5: @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +fpat6: + @echo $@ + @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + fpatnull: @echo $@ @AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ diff --git a/test/fpat6.awk b/test/fpat6.awk new file mode 100644 index 00000000..de7824a4 --- /dev/null +++ b/test/fpat6.awk @@ -0,0 +1,8 @@ +BEGIN { + FPAT = "([^,]*)|(\"[^\"]+\")" +} +{ + print "NF =", NF + for (i = 1; i <= NF; i++) + printf("$%d = <%s>\n", i, $i) +} diff --git a/test/fpat6.in b/test/fpat6.in new file mode 100644 index 00000000..1924cd97 --- /dev/null +++ b/test/fpat6.in @@ -0,0 +1,13 @@ +,,3 +,,3 +,,a,,b,, +,,a,,b,, +"a",,"b" + + +"" +"" +xx +xx +, +, diff --git a/test/fpat6.ok b/test/fpat6.ok new file mode 100644 index 00000000..f9c393a1 --- /dev/null +++ b/test/fpat6.ok @@ -0,0 +1,44 @@ +NF = 3 +$1 = <> +$2 = <> +$3 = <3> +NF = 3 +$1 = <> +$2 = <> +$3 = <3> +NF = 7 +$1 = <> +$2 = <> +$3 = <a> +$4 = <> +$5 = <b> +$6 = <> +$7 = <> +NF = 7 +$1 = <> +$2 = <> +$3 = <a> +$4 = <> +$5 = <b> +$6 = <> +$7 = <> +NF = 3 +$1 = <"a"> +$2 = <> +$3 = <"b"> +NF = 0 +NF = 0 +NF = 1 +$1 = <""> +NF = 1 +$1 = <""> +NF = 1 +$1 = <xx> +NF = 1 +$1 = <xx> +NF = 2 +$1 = <> +$2 = <> +NF = 2 
+$1 = <> +$2 = <> diff --git a/test/patsplit.ok b/test/patsplit.ok index cda8319e..02387d86 100644 --- a/test/patsplit.ok +++ b/test/patsplit.ok @@ -8,6 +8,7 @@ seps[0] = <> seps[1] = <,> seps[2] = <,> seps[3] = <,> +seps[4] = <> Splitting: <Smith,,"1234 A Pretty Place, NE",Sometown,NY,12345-6789,USA> n = 7 fields[1] = <Smith> @@ -24,6 +25,7 @@ seps[3] = <,> seps[4] = <,> seps[5] = <,> seps[6] = <,> +seps[7] = <> Splitting: <Robbins,Arnold,"1234 A Pretty Place, NE",Sometown,NY,12345-6789,USA> n = 7 fields[1] = <Robbins> @@ -40,6 +42,7 @@ seps[3] = <,> seps[4] = <,> seps[5] = <,> seps[6] = <,> +seps[7] = <> Splitting: <bbbaaacccdddaaaaaqqqq> n = 2 fields[1] = <aaa> |