From 8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 25 Feb 2009 09:10:09 +0000 Subject: * mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8 sequences since they are invalid in the Unicode standard. Handle surrogate pairs in case of wchar_t == UTF-16. * wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in case of wchar_t == UTF-16. --- newlib/libc/stdlib/mbtowc_r.c | 153 +++++++++--------------------------------- 1 file changed, 32 insertions(+), 121 deletions(-) (limited to 'newlib/libc/stdlib/mbtowc_r.c') diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c index 71bbf8537..00021beff 100644 --- a/newlib/libc/stdlib/mbtowc_r.c +++ b/newlib/libc/stdlib/mbtowc_r.c @@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state), if (s == NULL) return 0; /* UTF-8 character encodings are not state-dependent */ + if (state->__count == 4) + { + /* Create the second half of the surrogate pair. For a description + see the comment below. */ + wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) + | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) + | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) + | (wchar_t)(state->__value.__wchb[3] & 0x3f); + state->__count = 0; + *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff); + return 2; + } if (state->__count == 0) ch = t[i++]; else @@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state), else if (ch >= 0xf0 && ch <= 0xf7) { /* four-byte sequence */ - if (sizeof(wchar_t) < 4) - return -1; /* we can't store such a value */ + wint_t tmp; state->__value.__wchb[0] = ch; if (state->__count == 0) state->__count = 1; @@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state), ch = t[i++]; if (ch < 0x80 || ch > 0xbf) return -1; - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) - | (wchar_t)(ch & 0x3f); - - state->__count = 0; - return i; - } - else if (ch >= 0xf8 && ch <= 0xfb) - { - /* five-byte sequence */ - if (sizeof(wchar_t) < 4) - return -1; /* we can't store such a value */ - state->__value.__wchb[0] = ch; - if (state->__count == 0) - state->__count = 1; - else if (n < (size_t)-1) - ++n; - if (n < 2) - return -2; - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; - if (state->__value.__wchb[0] == 0xf8 && ch < 0x88) - /* overlong UTF-8 sequence */ - return -1; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[1] = ch; - if (state->__count == 1) - state->__count = 2; - else if (n < (size_t)-1) - ++n; - if (n < 3) - return -2; - ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2]; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[2] = ch; - if (state->__count == 2) - state->__count = 3; - else if (n < (size_t)-1) - ++n; - if (n < 4) - return -2; - ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3]; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[3] = ch; - state->__count = 4; - if (n < 5) - return -2; - ch = t[i++]; - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24) - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18) - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12) - | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6) - | (wchar_t)(ch & 0x3f); - - state->__count = 0; - return i; - } - else if (ch >= 0xfc && ch <= 0xfd) - { - /* six-byte sequence */ - int ch2; - if (sizeof(wchar_t) < 4) - return -1; /* we can't store such a value */ - state->__value.__wchb[0] = ch; - if (state->__count == 0) - state->__count = 1; - else if (n < (size_t)-1) - ++n; - if (n < 2) - return -2; - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; - if (state->__value.__wchb[0] == 0xfc && ch < 0x84) - /* overlong UTF-8 sequence */ - return -1; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[1] = ch; - if (state->__count == 1) - state->__count = 2; - else if (n < (size_t)-1) - ++n; - if (n < 3) - return -2; - ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2]; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[2] = ch; - if (state->__count == 2) - state->__count = 3; - else if (n < (size_t)-1) - ++n; - if (n < 4) - return -2; - ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3]; - if (ch < 0x80 || ch > 0xbf) - return -1; - state->__value.__wchb[3] = ch; - if (state->__count == 3) - state->__count = 4; - else if (n < (size_t)-1) - ++n; - if (n < 5) - return -2; - if (n == 5) - return -1; /* at this point we can't save enough to restart */ - ch = t[i++]; - if (ch < 0x80 || ch > 0xbf) - return -1; - ch2 = t[i++]; - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30) - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24) - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18) - | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12) - | (wchar_t)((ch & 0x3f) << 6) - | (wchar_t)(ch2 & 0x3f); - + tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) + | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) + | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6) + | (wint_t)(ch & 0x3f); + if (tmp > 0xffff && sizeof(wchar_t) == 2) + { + /* On systems which have wchar_t being UTF-16 values, the value + doesn't fit into a single wchar_t in this case. So what we + do here is to store the state with a special value of __count + and return the first half of a surrogate pair. As return + value we choose to return the half of the actual UTF-8 char. + The second half is returned in case we recognize the special + __count value above. */ + state->__value.__wchb[3] = ch; + state->__count = 4; + *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff); + return 2; + } + *pwc = tmp; state->__count = 0; return i; } -- cgit v1.2.3