aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 8
-rw-r--r-- node.c 38
-rw-r--r-- test/ChangeLog 6
-rw-r--r-- test/Makefile.am 11
-rw-r--r-- test/Makefile.in 11
-rw-r--r-- test/mbstr1.ok 1
-rw-r--r-- test/mbstr2.awk 1
-rw-r--r-- test/mbstr2.in 4
-rw-r--r-- test/mbstr2.ok 5
9 files changed, 72 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index 62647c1a..05f53423 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * node.c (str2wstr): Upon finding an invalid character, if
+ using UTF-8, use the replacement character instead of skipping
+ it. Helps match() and other functions work better in the face
+ of unexpected data. Make the lint warning an unconditional
+ warning.
+
2015-08-15 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c (dfamust): Restore c90 compat by moving some
diff --git a/node.c b/node.c
index 1741a13b..de771147 100644
--- a/node.c
+++ b/node.c
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
case (size_t) -2:
case (size_t) -1:
/*
- * Just skip the bad byte and keep going, so that
- * we get a more-or-less full string, instead of
- * stopping early. This is particularly important
- * for match() where we need to build the indices.
- */
- sp++;
- src_count--;
- /*
* mbrtowc(3) says the state of mbs becomes undefined
* after a bad character, so reset it.
*/
memset(& mbs, 0, sizeof(mbs));
- /* And warn the user something's wrong */
- if (do_lint && ! warned) {
+
+ /* Warn the user something's wrong */
+ if (! warned) {
warned = true;
- lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ }
+
+ /*
+ * 8/2015: If we're using UTF, then instead of just
+ * skipping the character, plug in the Unicode
+ * replacement character. In most cases this gives
+ * us "better" results, in that character counts
+ * and string lengths tend to make more sense.
+ *
+ * Otherwise, just skip the bad byte and keep going,
+ * so that we get a more-or-less full string, instead of
+ * stopping early. This is particularly important
+ * for match() where we need to build the indices.
+ */
+ if (using_utf8()) {
+ count = 1;
+ wc = 0xFFFD; /* unicode replacement character */
+ goto set_wc;
+ } else {
+ /* skip it and keep going */
+ sp++;
+ src_count--;
}
break;
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
count = 1;
/* fall through */
default:
+ set_wc:
*wsp++ = wc;
src_count -= count;
while (count--) {
diff --git a/test/ChangeLog b/test/ChangeLog
index cc7576ef..7b9e273a 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * mbstr1.ok: Updated after code change.
+ * Makefile.am (mbstr2): New test.
+ * mbstr2.awk, mbstr2.in, mbstr2.ok: New files.
+
2015-06-25 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (negtime): Fix out-of-tree test run.
diff --git a/test/Makefile.am b/test/Makefile.am
index 5e72014b..14ebf544 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -531,6 +531,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1062,7 +1065,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -1705,6 +1708,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/Makefile.in b/test/Makefile.in
index 0116f3a4..a78b3e6a 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -788,6 +788,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1318,7 +1321,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -2143,6 +2146,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/mbstr1.ok b/test/mbstr1.ok
index dcb4347b..3fd1bf8e 100644
--- a/test/mbstr1.ok
+++ b/test/mbstr1.ok
@@ -1,2 +1,3 @@
+gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
4
1
diff --git a/test/mbstr2.awk b/test/mbstr2.awk
new file mode 100644
index 00000000..4f2c8cc6
--- /dev/null
+++ b/test/mbstr2.awk
@@ -0,0 +1 @@
+match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) }
diff --git a/test/mbstr2.in b/test/mbstr2.in
new file mode 100644
index 00000000..36e971a6
--- /dev/null
+++ b/test/mbstr2.in
@@ -0,0 +1,4 @@
+missile:deathdate=20070306:
+P”rr”:deathdate=20070306:
+wizard:deathdate=20071103:
+Daithí:deathdate=20071103:
diff --git a/test/mbstr2.ok b/test/mbstr2.ok
new file mode 100644
index 00000000..29ac876a
--- /dev/null
+++ b/test/mbstr2.ok
@@ -0,0 +1,5 @@
+2007
+gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
+2007
+2007
+2007