about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 20:31:47 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 20:31:47 +0300
commit: 0d974ba125c1e8df68fecc2f579bb067d056cc3f (patch)
tree: 572046d03c2435c61497077d6dc16b85497adb45
parent: e0c35d9b6a94412c5c64a90953a33440b67c8a03 (diff)
parent: 243b097279a89d456fda4a400412482d70b3665c (diff)
download: egawk-0d974ba125c1e8df68fecc2f579bb067d056cc3f.tar.gz
 egawk-0d974ba125c1e8df68fecc2f579bb067d056cc3f.tar.bz2
 egawk-0d974ba125c1e8df68fecc2f579bb067d056cc3f.zip
Merge branch 'gawk-4.1-stable'
-rw-r--r-- ChangeLog 18
-rw-r--r-- awk.h 31
-rw-r--r-- mpfr.c 4
-rw-r--r-- node.c 42
-rw-r--r-- test/ChangeLog 6
-rw-r--r-- test/Makefile.am 11
-rw-r--r-- test/Makefile.in 11
-rw-r--r-- test/mbstr1.ok 1
-rw-r--r-- test/mbstr2.awk 1
-rw-r--r-- test/mbstr2.in 4
-rw-r--r-- test/mbstr2.ok 5
11 files changed, 117 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index ff048b2f..f80edb7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * node.c (str2wstr): Upon finding an invalid character, if
+ using UTF-8, use the replacement character instead of skipping
+ it. Helps match() and other functions work better in the face
+ of unexpected data. Make the lint warning an unconditional
+ warning.
+
+ Unrelated:
+
+ * awk.h: Add explanatory comment on the flags related to
+ types and values.
+ * mpfr.c (mpg_force_number): If setting NUMBER, clear STRING also
+ when clearing MAYBE_NUM.
+ (set_PREC): Check STRCUR instead of STRING.
+ * node.c (r_force_number): If setting NUMBER, clear STRING also
+ when clearing MAYBE_NUM.
+
2015-08-15 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c (dfamust): Restore c90 compat by moving some
diff --git a/awk.h b/awk.h
index 53af7f1b..8bf78fcd 100644
--- a/awk.h
+++ b/awk.h
@@ -404,6 +404,37 @@ typedef struct exp_node {
# define MALLOC 0x0001 /* can be free'd */
/* type = Node_val */
+ /*
+ * STRING and NUMBER are mutually exclusive. They represent the
+ * type of a value as assigned.
+ *
+ * STRCUR and NUMCUR are not mutually exclusive. They represent that
+ * the particular type of value is up to date. For example,
+ *
+ * a = 5 # NUMBER | NUMCUR
+ * b = a "" # Adds STRCUR to a, since a string value
+ * # is now available. But the type hasn't changed!
+ *
+ * a = "42" # STRING | STRCUR
+ * b = a + 0 # Adds NUMCUR to a, since numeric value
+ * # is now available. But the type hasn't changed!
+ *
+ * MAYBE_NUM is the joker. It means "this is string data, but
+ * the user may have really wanted it to be a number. If we have
+ * to guess, like in a comparison, turn it into a number."
+ * For example, gawk -v a=42 ....
+ * Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where
+ * a number is needed, it gets turned into a NUMBER and STRING
+ * is cleared.
+ *
+ * WSTRCUR is for efficiency. If in a multibyte locale, and we
+ * need to do something character based (substr, length, etc.)
+ * we create the corresponding wide character string and store it,
+ * and add WSTRCUR to the flags so that we don't have to do the
+ * conversion more than once.
+ *
+ * We hope that the rest of the flags are self-explanatory. :-)
+ */
# define STRING 0x0002 /* assigned as string */
# define STRCUR 0x0004 /* string value is current */
# define NUMCUR 0x0008 /* numeric value is current */
diff --git a/mpfr.c b/mpfr.c
index 080ed7fa..b7526cfb 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -347,7 +347,7 @@ mpg_force_number(NODE *n)
return n;
if ((n->flags & MAYBE_NUM) != 0) {
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
newflags = NUMBER;
}
@@ -525,7 +525,7 @@ set_PREC()
if ((val->flags & MAYBE_NUM) != 0)
force_number(val);
- if ((val->flags & (STRING|NUMBER)) == STRING) {
+ if ((val->flags & STRCUR) != 0) {
int i, j;
/* emulate IEEE-754 binary format */
diff --git a/node.c b/node.c
index b15159f0..9227cf2d 100644
--- a/node.c
+++ b/node.c
@@ -76,7 +76,7 @@ r_force_number(NODE *n)
return n;
} else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) {
if ((n->flags & MAYBE_NUM) != 0)
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
n->flags |= NUMBER|NUMCUR;
n->numbr = get_ieee_magic_val(n->stptr);
@@ -103,7 +103,7 @@ r_force_number(NODE *n)
if ((n->flags & MAYBE_NUM) != 0) {
newflags = NUMBER;
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
} else
newflags = 0;
@@ -716,22 +716,37 @@ str2wstr(NODE *n, size_t **ptr)
case (size_t) -2:
case (size_t) -1:
/*
- * Just skip the bad byte and keep going, so that
- * we get a more-or-less full string, instead of
- * stopping early. This is particularly important
- * for match() where we need to build the indices.
- */
- sp++;
- src_count--;
- /*
* mbrtowc(3) says the state of mbs becomes undefined
* after a bad character, so reset it.
*/
memset(& mbs, 0, sizeof(mbs));
- /* And warn the user something's wrong */
- if (do_lint && ! warned) {
+
+ /* Warn the user something's wrong */
+ if (! warned) {
warned = true;
- lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ }
+
+ /*
+ * 8/2015: If we're using UTF, then instead of just
+ * skipping the character, plug in the Unicode
+ * replacement character. In most cases this gives
+ * us "better" results, in that character counts
+ * and string lengths tend to make more sense.
+ *
+ * Otherwise, just skip the bad byte and keep going,
+ * so that we get a more-or-less full string, instead of
+ * stopping early. This is particularly important
+ * for match() where we need to build the indices.
+ */
+ if (using_utf8()) {
+ count = 1;
+ wc = 0xFFFD; /* unicode replacement character */
+ goto set_wc;
+ } else {
+ /* skip it and keep going */
+ sp++;
+ src_count--;
}
break;
@@ -739,6 +754,7 @@ str2wstr(NODE *n, size_t **ptr)
count = 1;
/* fall through */
default:
+ set_wc:
*wsp++ = wc;
src_count -= count;
while (count--) {
diff --git a/test/ChangeLog b/test/ChangeLog
index 8cc7915e..736860db 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * mbstr1.ok: Updated after code change.
+ * Makefile.am (mbstr2): New test.
+ * mbstr2.awk, mbstr2.in, mbstr2.ok: New files.
+
2015-06-29 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (dbugeval2, typedregex3): New tests.
diff --git a/test/Makefile.am b/test/Makefile.am
index b9e14468..68fd709f 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -547,6 +547,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1107,7 +1110,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
nonfatal1 nonfatal2 nonfatal3 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
@@ -1760,6 +1763,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/Makefile.in b/test/Makefile.in
index 0ae51917..1fe8da66 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -804,6 +804,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1363,7 +1366,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
nonfatal1 nonfatal2 nonfatal3 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
@@ -2198,6 +2201,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/mbstr1.ok b/test/mbstr1.ok
index dcb4347b..3fd1bf8e 100644
--- a/test/mbstr1.ok
+++ b/test/mbstr1.ok
@@ -1,2 +1,3 @@
+gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
4
1
diff --git a/test/mbstr2.awk b/test/mbstr2.awk
new file mode 100644
index 00000000..4f2c8cc6
--- /dev/null
+++ b/test/mbstr2.awk
@@ -0,0 +1 @@
+match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) }
diff --git a/test/mbstr2.in b/test/mbstr2.in
new file mode 100644
index 00000000..36e971a6
--- /dev/null
+++ b/test/mbstr2.in
@@ -0,0 +1,4 @@
+missile:deathdate=20070306:
+P”rr”:deathdate=20070306:
+wizard:deathdate=20071103:
+Daithí:deathdate=20071103:
diff --git a/test/mbstr2.ok b/test/mbstr2.ok
new file mode 100644
index 00000000..29ac876a
--- /dev/null
+++ b/test/mbstr2.ok
@@ -0,0 +1,5 @@
+2007
+gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
+2007
+2007
+2007