diff options
Diffstat (limited to 'newlib/libc/stdlib/mbtowc_r.c')
-rw-r--r-- | newlib/libc/stdlib/mbtowc_r.c | 819 |
1 files changed, 502 insertions, 317 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c index d3856e041..78bde4b78 100644 --- a/newlib/libc/stdlib/mbtowc_r.c +++ b/newlib/libc/stdlib/mbtowc_r.c @@ -5,10 +5,53 @@ #include <wchar.h> #include <string.h> #include <errno.h> +#include "local.h" -#ifdef _MB_CAPABLE -extern char *__locale_charset (); +int (*__mbtowc) (struct _reent *, wchar_t *, const char *, size_t, + const char *, mbstate_t *) + = __ascii_mbtowc; + +int +_DEFUN (_mbtowc_r, (r, pwc, s, n, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + mbstate_t *state) +{ + return __mbtowc (r, pwc, s, n, __locale_charset (), state); +} +int +_DEFUN (__ascii_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + + if (pwc == NULL) + pwc = &dummy; + + if (s == NULL) + return 0; + + if (n == 0) + return -2; + + *pwc = (wchar_t)*t; + + if (*t == '\0') + return 0; + + return 1; +} + +#ifdef _MB_CAPABLE typedef enum { ESCAPE, DOLLAR, BRACKET, AT, B, J, NUL, JIS_CHAR, OTHER, JIS_C_NUM } JIS_CHAR_TYPE; typedef enum { ASCII, JIS, A_ESC, A_ESC_DL, JIS_1, J_ESC, J_ESC_BR, @@ -43,17 +86,18 @@ static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = { /* J_ESC */ { ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, /* J_ESC_BR */{ ERROR, ERROR, ERROR, ERROR, MAKE_A, MAKE_A, ERROR, ERROR, ERROR }, }; -#endif /* _MB_CAPABLE */ /* we override the mbstate_t __count field for more complex encodings and use it store a state value */ #define __state __count +#ifdef _MB_EXTENDED_CHARSETS_ISO int -_DEFUN (_mbtowc_r, (r, pwc, s, n, state), - struct _reent *r _AND - wchar_t *pwc _AND - const char *s _AND - size_t n _AND +_DEFUN (__iso_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND mbstate_t *state) { wchar_t dummy; @@ -62,358 +106,394 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state), if (pwc == NULL) pwc = &dummy; - if (s != NULL && n == 0) + if (s == NULL) + return 0; + + if (n == 0) return -2; -#ifdef _MB_CAPABLE - if (strlen (__locale_charset ()) <= 1) - { /* fall-through */ } - else if (!strcmp (__locale_charset (), "UTF-8")) + if (*t >= 0xa0) { - int ch; - int i = 0; - - if (s == NULL) - return 0; /* UTF-8 character encodings are not state-dependent */ - - if (state->__count == 4) + int iso_idx = __iso_8859_index (charset + 9); + if (iso_idx >= 0) { - /* Create the second half of the surrogate pair. For a description - see the comment below. */ - wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) - | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) - | (wchar_t)(state->__value.__wchb[3] & 0x3f); - state->__count = 0; - *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff); - return 2; - } - if (state->__count == 0) - ch = t[i++]; - else - { - if (n < (size_t)-1) - ++n; - ch = state->__value.__wchb[0]; - } - - if (ch == '\0') - { - *pwc = 0; - state->__count = 0; - return 0; /* s points to the null character */ - } - - if (ch >= 0x0 && ch <= 0x7f) - { - /* single-byte sequence */ - state->__count = 0; - *pwc = ch; - return 1; - } - else if (ch >= 0xc0 && ch <= 0xdf) - { - /* two-byte sequence */ - state->__value.__wchb[0] = ch; - state->__count = 1; - if (n < 2) - return -2; - ch = t[i++]; - if (ch < 0x80 || ch > 0xbf) + *pwc = __iso_8859_conv[iso_idx][*t - 0xa0]; + if (*pwc == 0) /* Invalid character */ { r->_errno = EILSEQ; return -1; } - if (state->__value.__wchb[0] < 0xc2) - { - /* overlong UTF-8 sequence */ - r->_errno = EILSEQ; - return -1; - } - state->__count = 0; - *pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6) - | (wchar_t)(ch & 0x3f); - return i; + return 1; } - else if (ch >= 0xe0 && ch <= 0xef) + } + + *pwc = (wchar_t) *t; + + if (*t == '\0') + return 0; + + return 1; +} +#endif /* _MB_EXTENDED_CHARSETS_ISO */ + +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS +int +_DEFUN (__cp_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + + if (pwc == NULL) + pwc = &dummy; + + if (s == NULL) + return 0; + + if (n == 0) + return -2; + + if (*t >= 0x80) + { + int cp_idx = __cp_index (charset + 2); + if (cp_idx >= 0) { - /* three-byte sequence */ - wchar_t tmp; - state->__value.__wchb[0] = ch; - if (state->__count == 0) - state->__count = 1; - else if (n < (size_t)-1) - ++n; - if (n < 2) - return -2; - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; - if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0) - { - /* overlong UTF-8 sequence */ - r->_errno = EILSEQ; - return -1; - } - if (ch < 0x80 || ch > 0xbf) - { - r->_errno = EILSEQ; - return -1; - } - state->__value.__wchb[1] = ch; - state->__count = 2; - if (n < 3) - return -2; - ch = t[i++]; - if (ch < 0x80 || ch > 0xbf) + *pwc = __cp_conv[cp_idx][*t - 0x80]; + if (*pwc == 0) /* Invalid character */ { r->_errno = EILSEQ; return -1; } - state->__count = 0; - tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12) - | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6) - | (wchar_t)(ch & 0x3f); - - if (tmp >= 0xd800 && tmp <= 0xdfff) - { - r->_errno = EILSEQ; - return -1; - } - *pwc = tmp; - return i; + return 1; } - else if (ch >= 0xf0 && ch <= 0xf7) + } + + *pwc = (wchar_t)*t; + + if (*t == '\0') + return 0; + + return 1; +} +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ + +int +_DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + int ch; + int i = 0; + + if (pwc == NULL) + pwc = &dummy; + + if (s == NULL) + return 0; + + if (n == 0) + return -2; + + if (state->__count == 4) + { + /* Create the second half of the surrogate pair. For a description + see the comment below. */ + wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) + | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) + | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) + | (wchar_t)(state->__value.__wchb[3] & 0x3f); + state->__count = 0; + *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff); + return 2; + } + if (state->__count == 0) + ch = t[i++]; + else + { + if (n < (size_t)-1) + ++n; + ch = state->__value.__wchb[0]; + } + + if (ch == '\0') + { + *pwc = 0; + state->__count = 0; + return 0; /* s points to the null character */ + } + + if (ch >= 0x0 && ch <= 0x7f) + { + /* single-byte sequence */ + state->__count = 0; + *pwc = ch; + return 1; + } + if (ch >= 0xc0 && ch <= 0xdf) + { + /* two-byte sequence */ + state->__value.__wchb[0] = ch; + state->__count = 1; + if (n < 2) + return -2; + ch = t[i++]; + if (ch < 0x80 || ch > 0xbf) { - /* four-byte sequence */ - wint_t tmp; - state->__value.__wchb[0] = ch; - if (state->__count == 0) - state->__count = 1; - else if (n < (size_t)-1) - ++n; - if (n < 2) - return -2; - ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; - if (state->__value.__wchb[0] == 0xf0 && ch < 0x90) - { - /* overlong UTF-8 sequence */ - r->_errno = EILSEQ; - return -1; - } - if (ch < 0x80 || ch > 0xbf) - { - r->_errno = EILSEQ; - return -1; - } - state->__value.__wchb[1] = ch; - if (state->__count == 1) - state->__count = 2; - else if (n < (size_t)-1) - ++n; - if (n < 3) - return -2; - ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2]; - if (ch < 0x80 || ch > 0xbf) - { - r->_errno = EILSEQ; - return -1; - } - state->__value.__wchb[2] = ch; - state->__count = 3; - if (n < 4) - return -2; - ch = t[i++]; - if (ch < 0x80 || ch > 0xbf) - { - r->_errno = EILSEQ; - return -1; - } - tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) - | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) - | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6) - | (wint_t)(ch & 0x3f); - if (tmp > 0xffff && sizeof(wchar_t) == 2) - { - /* On systems which have wchar_t being UTF-16 values, the value - doesn't fit into a single wchar_t in this case. So what we - do here is to store the state with a special value of __count - and return the first half of a surrogate pair. As return - value we choose to return the half of the actual UTF-8 char. - The second half is returned in case we recognize the special - __count value above. */ - state->__value.__wchb[3] = ch; - state->__count = 4; - *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff); - return 2; - } - *pwc = tmp; - state->__count = 0; - return i; + r->_errno = EILSEQ; + return -1; } - else + if (state->__value.__wchb[0] < 0xc2) { + /* overlong UTF-8 sequence */ r->_errno = EILSEQ; return -1; } - } - else if (!strcmp (__locale_charset (), "SJIS")) + state->__count = 0; + *pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6) + | (wchar_t)(ch & 0x3f); + return i; + } + if (ch >= 0xe0 && ch <= 0xef) { - int ch; - int i = 0; - if (s == NULL) - return 0; /* not state-dependent */ - ch = t[i++]; + /* three-byte sequence */ + wchar_t tmp; + state->__value.__wchb[0] = ch; if (state->__count == 0) + state->__count = 1; + else if (n < (size_t)-1) + ++n; + if (n < 2) + return -2; + ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; + if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0) { - if (_issjis1 (ch)) - { - state->__value.__wchb[0] = ch; - state->__count = 1; - if (n <= 1) - return -2; - ch = t[i++]; - } + /* overlong UTF-8 sequence */ + r->_errno = EILSEQ; + return -1; } - if (state->__count == 1) + if (ch < 0x80 || ch > 0xbf) { - if (_issjis2 (ch)) - { - *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; - state->__count = 0; - return i; - } - else - { - r->_errno = EILSEQ; - return -1; - } + r->_errno = EILSEQ; + return -1; + } + state->__value.__wchb[1] = ch; + state->__count = 2; + if (n < 3) + return -2; + ch = t[i++]; + if (ch < 0x80 || ch > 0xbf) + { + r->_errno = EILSEQ; + return -1; } + state->__count = 0; + tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12) + | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6) + | (wchar_t)(ch & 0x3f); + + if (tmp >= 0xd800 && tmp <= 0xdfff) + { + r->_errno = EILSEQ; + return -1; + } + *pwc = tmp; + return i; } - else if (!strcmp (__locale_charset (), "EUCJP")) + if (ch >= 0xf0 && ch <= 0xf7) { - int ch; - int i = 0; - if (s == NULL) - return 0; /* not state-dependent */ - ch = t[i++]; + /* four-byte sequence */ + wint_t tmp; + state->__value.__wchb[0] = ch; if (state->__count == 0) + state->__count = 1; + else if (n < (size_t)-1) + ++n; + if (n < 2) + return -2; + ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; + if (state->__value.__wchb[0] == 0xf0 && ch < 0x90) { - if (_iseucjp (ch)) - { - state->__value.__wchb[0] = ch; - state->__count = 1; - if (n <= 1) - return -2; - ch = t[i++]; - } + /* overlong UTF-8 sequence */ + r->_errno = EILSEQ; + return -1; + } + if (ch < 0x80 || ch > 0xbf) + { + r->_errno = EILSEQ; + return -1; } + state->__value.__wchb[1] = ch; if (state->__count == 1) + state->__count = 2; + else if (n < (size_t)-1) + ++n; + if (n < 3) + return -2; + ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2]; + if (ch < 0x80 || ch > 0xbf) { - if (_iseucjp (ch)) - { - *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; - state->__count = 0; - return i; - } - else - { - r->_errno = EILSEQ; - return -1; - } + r->_errno = EILSEQ; + return -1; + } + state->__value.__wchb[2] = ch; + state->__count = 3; + if (n < 4) + return -2; + ch = t[i++]; + if (ch < 0x80 || ch > 0xbf) + { + r->_errno = EILSEQ; + return -1; } + tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) + | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) + | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6) + | (wint_t)(ch & 0x3f); + if (tmp > 0xffff && sizeof(wchar_t) == 2) + { + /* On systems which have wchar_t being UTF-16 values, the value + doesn't fit into a single wchar_t in this case. So what we + do here is to store the state with a special value of __count + and return the first half of a surrogate pair. As return + value we choose to return the half of the actual UTF-8 char. + The second half is returned in case we recognize the special + __count value above. */ + state->__value.__wchb[3] = ch; + state->__count = 4; + *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff); + return 2; + } + *pwc = tmp; + state->__count = 0; + return i; } - else if (!strcmp (__locale_charset (), "JIS")) - { - JIS_STATE curr_state; - JIS_ACTION action; - JIS_CHAR_TYPE ch; - unsigned char *ptr; - unsigned int i; - int curr_ch; - - if (s == NULL) - { - state->__state = ASCII; - return 1; /* state-dependent */ - } - - curr_state = state->__state; - ptr = t; - - for (i = 0; i < n; ++i) - { - curr_ch = t[i]; - switch (curr_ch) - { - case ESC_CHAR: - ch = ESCAPE; - break; - case '$': - ch = DOLLAR; - break; - case '@': - ch = AT; - break; - case '(': - ch = BRACKET; - break; - case 'B': - ch = B; - break; - case 'J': - ch = J; - break; - case '\0': - ch = NUL; - break; - default: - if (_isjis (curr_ch)) - ch = JIS_CHAR; - else - ch = OTHER; - } - action = JIS_action_table[curr_state][ch]; - curr_state = JIS_state_table[curr_state][ch]; - - switch (action) - { - case NOOP: - break; - case EMPTY: - state->__state = ASCII; - *pwc = (wchar_t)0; - return 0; - case COPY_A: - state->__state = ASCII; - *pwc = (wchar_t)*ptr; - return (i + 1); - case COPY_J1: - state->__value.__wchb[0] = t[i]; - break; - case COPY_J2: - state->__state = JIS; - *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]); - return (i + 1); - case MAKE_A: - ptr = (unsigned char *)(t + i + 1); - break; - case ERROR: - default: - r->_errno = EILSEQ; - return -1; - } + r->_errno = EILSEQ; + return -1; +} - } +/* Cygwin defines its own doublebyte charset conversion functions + because the underlying OS requires wchar_t == UTF-16. */ +#ifndef __CYGWIN__ +int +_DEFUN (__sjis_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + int ch; + int i = 0; - state->__state = curr_state; - return -2; /* n < bytes needed */ - } -#endif /* _MB_CAPABLE */ + if (pwc == NULL) + pwc = &dummy; - /* otherwise this must be the "C" locale or unknown locale */ if (s == NULL) return 0; /* not state-dependent */ + if (n == 0) + return -2; + + ch = t[i++]; + if (state->__count == 0) + { + if (_issjis1 (ch)) + { + state->__value.__wchb[0] = ch; + state->__count = 1; + if (n <= 1) + return -2; + ch = t[i++]; + } + } + if (state->__count == 1) + { + if (_issjis2 (ch)) + { + *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; + state->__count = 0; + return i; + } + else + { + r->_errno = EILSEQ; + return -1; + } + } + + *pwc = (wchar_t)*t; + + if (*t == '\0') + return 0; + + return 1; +} + +int +_DEFUN (__eucjp_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + int ch; + int i = 0; + + if (pwc == NULL) + pwc = &dummy; + + if (s == NULL) + return 0; + + if (n == 0) + return -2; + + ch = t[i++]; + if (state->__count == 0) + { + if (_iseucjp (ch)) + { + state->__value.__wchb[0] = ch; + state->__count = 1; + if (n <= 1) + return -2; + ch = t[i++]; + } + } + if (state->__count == 1) + { + if (_iseucjp (ch)) + { + *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; + state->__count = 0; + return i; + } + else + { + r->_errno = EILSEQ; + return -1; + } + } + *pwc = (wchar_t)*t; if (*t == '\0') @@ -421,3 +501,108 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state), return 1; } + +int +_DEFUN (__jis_mbtowc, (r, pwc, s, n, charset, state), + struct _reent *r _AND + wchar_t *pwc _AND + const char *s _AND + size_t n _AND + const char *charset _AND + mbstate_t *state) +{ + wchar_t dummy; + unsigned char *t = (unsigned char *)s; + JIS_STATE curr_state; + JIS_ACTION action; + JIS_CHAR_TYPE ch; + unsigned char *ptr; + unsigned int i; + int curr_ch; + + if (pwc == NULL) + pwc = &dummy; + + if (s == NULL) + { + state->__state = ASCII; + return 1; /* state-dependent */ + } + + if (n == 0) + return -2; + + curr_state = state->__state; + ptr = t; + + for (i = 0; i < n; ++i) + { + curr_ch = t[i]; + switch (curr_ch) + { + case ESC_CHAR: + ch = ESCAPE; + break; + case '$': + ch = DOLLAR; + break; + case '@': + ch = AT; + break; + case '(': + ch = BRACKET; + break; + case 'B': + ch = B; + break; + case 'J': + ch = J; + break; + case '\0': + ch = NUL; + break; + default: + if (_isjis (curr_ch)) + ch = JIS_CHAR; + else + ch = OTHER; + } + + action = JIS_action_table[curr_state][ch]; + curr_state = JIS_state_table[curr_state][ch]; + + switch (action) + { + case NOOP: + break; + case EMPTY: + state->__state = ASCII; + *pwc = (wchar_t)0; + return 0; + case COPY_A: + state->__state = ASCII; + *pwc = (wchar_t)*ptr; + return (i + 1); + case COPY_J1: + state->__value.__wchb[0] = t[i]; + break; + case COPY_J2: + state->__state = JIS; + *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]); + return (i + 1); + case MAKE_A: + ptr = (unsigned char *)(t + i + 1); + break; + case ERROR: + default: + r->_errno = EILSEQ; + return -1; + } + + } + + state->__state = curr_state; + return -2; /* n < bytes needed */ +} +#endif /* !__CYGWIN__*/ +#endif /* _MB_CAPABLE */ |