diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 19:51:54 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 19:51:54 +0300 |
commit | 278fe876bb18938803ac1c36b028adb8cef6fe84 (patch) | |
tree | 289302aa264df2025b48f07fbffca7b70bd01f83 | |
parent | 96cc85ac9ba06ab6b9edface5e4c34392a07a98d (diff) | |
download | egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.gz egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.bz2 egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.zip |
Improve handling of invalid data in UTF locales.
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | node.c | 38 | ||||
-rw-r--r-- | test/ChangeLog | 6 | ||||
-rw-r--r-- | test/Makefile.am | 11 | ||||
-rw-r--r-- | test/Makefile.in | 11 | ||||
-rw-r--r-- | test/mbstr1.ok | 1 | ||||
-rw-r--r-- | test/mbstr2.awk | 1 | ||||
-rw-r--r-- | test/mbstr2.in | 4 | ||||
-rw-r--r-- | test/mbstr2.ok | 5 |
9 files changed, 72 insertions, 13 deletions
@@ -1,3 +1,11 @@ +2015-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * node.c (str2wstr): Upon finding an invalid character, if + using UTF-8, use the replacement character instead of skipping + it. Helps match() and other functions work better in the face + of unexpected data. Make the lint warning an unconditional + warning. + 2015-08-15 Arnold D. Robbins <arnold@skeeve.com> * dfa.c (dfamust): Restore c90 compat by moving some @@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr) case (size_t) -2: case (size_t) -1: /* - * Just skip the bad byte and keep going, so that - * we get a more-or-less full string, instead of - * stopping early. This is particularly important - * for match() where we need to build the indices. - */ - sp++; - src_count--; - /* * mbrtowc(3) says the state of mbs becomes undefined * after a bad character, so reset it. */ memset(& mbs, 0, sizeof(mbs)); - /* And warn the user something's wrong */ - if (do_lint && ! warned) { + + /* Warn the user something's wrong */ + if (! warned) { warned = true; - lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale.")); + warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale.")); + } + + /* + * 8/2015: If we're using UTF, then instead of just + * skipping the character, plug in the Unicode + * replacement character. In most cases this gives + * us "better" results, in that character counts + * and string lengths tend to make more sense. + * + * Otherwise, just skip the bad byte and keep going, + * so that we get a more-or-less full string, instead of + * stopping early. This is particularly important + * for match() where we need to build the indices. + */ + if (using_utf8()) { + count = 1; + wc = 0xFFFD; /* unicode replacement character */ + goto set_wc; + } else { + /* skip it and keep going */ + sp++; + src_count--; } break; @@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr) count = 1; /* fall through */ default: + set_wc: *wsp++ = wc; src_count -= count; while (count--) { diff --git a/test/ChangeLog b/test/ChangeLog index cc7576ef..7b9e273a 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,9 @@ +2015-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * mbstr1.ok: Updated after code change. + * Makefile.am (mbstr2): New test. + * mbstr2.awk, mbstr2.in, mbstr2.ok: New files. + 2015-06-25 Arnold D. Robbins <arnold@skeeve.com> * Makefile.am (negtime): Fix out-of-tree test run. diff --git a/test/Makefile.am b/test/Makefile.am index 5e72014b..14ebf544 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -531,6 +531,9 @@ EXTRA_DIST = \ mbprintf4.ok \ mbstr1.awk \ mbstr1.ok \ + mbstr2.awk \ + mbstr2.in \ + mbstr2.ok \ membug1.awk \ membug1.in \ membug1.ok \ @@ -1062,7 +1065,7 @@ GAWK_EXT_TESTS = \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ include include2 indirectbuiltin indirectcall indirectcall2 \ lint lintold lintwarn \ - manyfiles match1 match2 match3 mbstr1 \ + manyfiles match1 match2 match3 mbstr1 mbstr2 \ nastyparm negtime next nondec nondec2 \ patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \ profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \ @@ -1705,6 +1708,12 @@ mbstr1:: AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +mbstr2:: + @echo $@ + @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \ + AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + printfbad2: printfbad2.ok @echo $@ @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@ diff --git a/test/Makefile.in b/test/Makefile.in index 0116f3a4..a78b3e6a 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -788,6 +788,9 @@ EXTRA_DIST = \ mbprintf4.ok \ mbstr1.awk \ mbstr1.ok \ + mbstr2.awk \ + mbstr2.in \ + mbstr2.ok \ membug1.awk \ membug1.in \ membug1.ok \ @@ -1318,7 +1321,7 @@ GAWK_EXT_TESTS = \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ include include2 indirectbuiltin indirectcall indirectcall2 \ lint lintold lintwarn \ - manyfiles match1 match2 match3 mbstr1 \ + manyfiles match1 match2 match3 mbstr1 mbstr2 \ nastyparm negtime next nondec nondec2 \ patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \ profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \ @@ -2143,6 +2146,12 @@ mbstr1:: AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +mbstr2:: + @echo $@ + @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \ + AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + printfbad2: printfbad2.ok @echo $@ @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@ diff --git a/test/mbstr1.ok b/test/mbstr1.ok index dcb4347b..3fd1bf8e 100644 --- a/test/mbstr1.ok +++ b/test/mbstr1.ok @@ -1,2 +1,3 @@ +gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale. 4 1 diff --git a/test/mbstr2.awk b/test/mbstr2.awk new file mode 100644 index 00000000..4f2c8cc6 --- /dev/null +++ b/test/mbstr2.awk @@ -0,0 +1 @@ +match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) } diff --git a/test/mbstr2.in b/test/mbstr2.in new file mode 100644 index 00000000..36e971a6 --- /dev/null +++ b/test/mbstr2.in @@ -0,0 +1,4 @@ +missile:deathdate=20070306: +P”rr”:deathdate=20070306: +wizard:deathdate=20071103: +Daithí:deathdate=20071103: diff --git a/test/mbstr2.ok b/test/mbstr2.ok new file mode 100644 index 00000000..29ac876a --- /dev/null +++ b/test/mbstr2.ok @@ -0,0 +1,5 @@ +2007 +gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale. +2007 +2007 +2007 |