aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog11
-rw-r--r--TODO4
-rw-r--r--awk.h4
-rw-r--r--io.c12
-rw-r--r--main.c3
-rw-r--r--node.c21
6 files changed, 45 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index f9e4e1d1..da63c9d9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+Sat Dec 4 21:44:38 2010 Arnold D. Robbins <arnold@skeeve.com>
+
+ * node.c (init_btowc_cache): New function.
+ (btowc_cache): New array.
+ (str2wstr): Use is_valid_character in test instead of several isXXX
+ calls.
+ * awk.h [is_valid_character]: Macro to use btowc_cache.
+ * main.c (main): Call init_btowc_cache().
+ * io.c (rs1scan): Add call to is_valid_character when processing
+ characters byte by byte.
+
Wed Dec 1 08:10:21 2010 Arnold D. Robbins <arnold@skeeve.com>
* awk.h, awkgram.y, debug.c: Change CONTEXT to AWK_CONTEXT
diff --git a/TODO b/TODO
index 02dbcc28..9276532a 100644
--- a/TODO
+++ b/TODO
@@ -1,9 +1,7 @@
FIX regular field splitting to use FPAT algorithm.
#Revise plug-in code to check for special symbol per GNU coding standards.
-
-Paolo's code for checking for single char values in str2wstr().
-
+#Paolo's code for checking for single char values in str2wstr().
#Fix os_close_on_exec to do read/modify/write of the fd flags.
#Consider forcing [a-z] int abc...wxyz in regexes, no matter what the locale.
diff --git a/awk.h b/awk.h
index 993e5820..8a5c6eb8 100644
--- a/awk.h
+++ b/awk.h
@@ -1323,8 +1323,12 @@ extern const wchar_t *wstrstr(const wchar_t *haystack, size_t hs_len,
extern const wchar_t *wcasestrstr(const wchar_t *haystack, size_t hs_len,
const wchar_t *needle, size_t needle_len);
extern void free_wstr(NODE *n);
+extern wint_t btowc_cache[];
+extern void init_btowc_cache();
+#define is_valid_character(b) (btowc_cache[(b) & 0xFF] != WEOF) /* mask to 0..255: a plain char byte may be negative */
#else
#define free_wstr(NODE) /* empty */
+#define is_valid_character(c) (TRUE)
#endif
/* re.c */
extern Regexp *make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal);
diff --git a/io.c b/io.c
index 54fa46bb..efad6af3 100644
--- a/io.c
+++ b/io.c
@@ -2684,13 +2684,19 @@ rs1scan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state)
if (rs != '\n' && gawk_mb_cur_max > 1) {
int len = iop->dataend - bp;
int found = 0;
+
memset(&mbs, 0, sizeof(mbstate_t));
do {
if (*bp == rs)
found = 1;
- mbclen = mbrlen(bp, len, &mbs);
- if ((mbclen == 1) || (mbclen == (size_t) -1)
- || (mbclen == (size_t) -2) || (mbclen == 0)) {
+ if (is_valid_character(*bp))
+ mbclen = 1;
+ else
+ mbclen = mbrlen(bp, len, &mbs);
+ if ( (mbclen == 1)
+ || (mbclen == (size_t) -1)
+ || (mbclen == (size_t) -2)
+ || (mbclen == 0)) {
/* We treat it as a singlebyte character. */
mbclen = 1;
}
diff --git a/main.c b/main.c
index b09027cf..01561b47 100644
--- a/main.c
+++ b/main.c
@@ -293,6 +293,9 @@ main(int argc, char **argv)
*/
gawk_mb_cur_max = MB_CUR_MAX;
/* Without MBS_SUPPORT, gawk_mb_cur_max is 1. */
+
+ /* init the cache for checking bytes if they're characters */
+ init_btowc_cache();
#endif
(void) bindtextdomain(PACKAGE, LOCALEDIR);
diff --git a/node.c b/node.c
index d181bff9..0dd5f52a 100644
--- a/node.c
+++ b/node.c
@@ -706,11 +706,9 @@ str2wstr(NODE *n, size_t **ptr)
* 9/2010: Check the current byte; if it's a valid character,
* then it doesn't start a multibyte sequence. This brings a
* big speed up. Thanks to Ulrich Drepper for the tip.
+ * 11/2010: Thanks to Paolo Bonzini for some even faster code.
*/
- if ( isprint(*sp)
- || isgraph(*sp)
- || iscntrl(*sp)
- || *sp == '\0' ) {
+ if (is_valid_character(*sp)) {
count = 1;
wc = *sp;
} else
@@ -894,3 +892,18 @@ get_ieee_magic_val(const char *val)
return v;
}
+
+#ifdef MBS_SUPPORT
+wint_t btowc_cache[256];
+
+/* init_btowc_cache --- fill btowc_cache[] with btowc() of every byte value, 0 through 255 */
+
+void init_btowc_cache()
+{
+ int i;
+
+ for (i = 0; i <= 255; i++) { /* <= 255: the last slot (byte 0xFF) must be filled too */
+ btowc_cache[i] = btowc(i);
+ }
+}
+#endif