summaryrefslogtreecommitdiffstats
path: root/utf8.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-02-05 07:56:38 +0100
committerKaz Kylheku <kaz@kylheku.com>2012-02-05 07:56:38 +0100
commit4c6802f7284e17b7cbc1f178f90775182d379be6 (patch)
tree0023ac62d824c12e3b6508c9665e27dd4af28ea4 /utf8.c
parent6949749e00019594d17a2dd7788dadd1663aff64 (diff)
downloadtxr-4c6802f7284e17b7cbc1f178f90775182d379be6.tar.gz
txr-4c6802f7284e17b7cbc1f178f90775182d379be6.tar.bz2
txr-4c6802f7284e17b7cbc1f178f90775182d379be6.zip
* utf8.c (utf8_from_uc, utf8_decode): Some cascaded if tests converted
to a switch on the upper nybble value. This also fixes an unfortunate bug. The test for the two-byte case was written as ch >= 0xC2 && ch <= 0xE0. That should have been ch < 0xE0: the byte 0xE0, which begins a three-byte sequence, was being caught by the two-byte test first. Versions of TXR up to 55 have been incorrectly decoding some UTF-8.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c50
1 files changed, 34 insertions, 16 deletions
diff --git a/utf8.c b/utf8.c
index d4ca3513..d61d7073 100644
--- a/utf8.c
+++ b/utf8.c
@@ -69,30 +69,40 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
switch (state) {
case utf8_init:
- if (ch < 0x80) {
+ switch (ch >> 4) {
+ case 0x0: case 0x1: case 0x2: case 0x3:
+ case 0x4: case 0x5: case 0x6: case 0x7:
if (wdst)
*wdst++ = ch;
nchar++;
- } else if (ch >= 0xC2 && ch <= 0xE0) {
+ break;
+ case 0xC: case 0xD:
state = utf8_more1;
wch = (ch & 0x1F);
wch_min = 0x80;
- } else if (ch >= 0xE0 && ch <= 0xEF) {
+ break;
+ case 0xE:
state = utf8_more2;
wch = (ch & 0xF);
wch_min = 0x800;
- } else if (ch >= 0xF0 && ch < 0xF5) {
+ break;
+ case 0xF:
#ifdef FULL_UNICODE
- state = utf8_more3;
- wch = (ch & 0x7);
- wch_min = 0x10000;
+ if (ch < 0xF5) {
+ state = utf8_more3;
+ wch = (ch & 0x7);
+ wch_min = 0x10000;
+ break;
+ }
+ /* fallthrough */
#else
conversion_error();
#endif
- } else {
+ default:
if (wdst)
*wdst++ = 0xDC00 | ch;
nchar++;
+ break;
}
backtrack = src;
break;
@@ -279,26 +289,34 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
switch (ud->state) {
case utf8_init:
- if (ch < 0x80) {
+ switch (ch >> 4) {
+ case 0x0: case 0x1: case 0x2: case 0x3:
+ case 0x4: case 0x5: case 0x6: case 0x7:
ud->back = ud->tail;
return ch;
- } else if (ch >= 0xC0 && ch <= 0xE0) {
+ case 0xC: case 0xD:
ud->state = utf8_more1;
ud->wch = (ch & 0x1F);
ud->wch_min = 0x80;
- } else if (ch >= 0xE0 && ch <= 0xEF) {
+ break;
+ case 0xE:
ud->state = utf8_more2;
ud->wch = (ch & 0xF);
ud->wch_min = 0x800;
- } else if (ch >= 0xF0 && ch < 0xF5) {
+ break;
+ case 0xF:
#ifdef FULL_UNICODE
- ud->state = utf8_more3;
- ud->wch = (ch & 0x7);
- ud->wch_min = 0x100000;
+ if (ch < 0xF5) {
+ ud->state = utf8_more3;
+ ud->wch = (ch & 0x7);
+ ud->wch_min = 0x100000;
+ break;
+ }
+ /* fallthrough */
#else
conversion_error();
#endif
- } else {
+ default:
ud->back = ud->tail;
return 0xDC00 | ch;
}