diff options
author | Corinna Vinschen <corinna@vinschen.de> | 2009-03-24 10:13:27 +0000 |
---|---|---|
committer | Corinna Vinschen <corinna@vinschen.de> | 2009-03-24 10:13:27 +0000 |
commit | 28186e81d947a830d9895cecc2d8e836a3cbccd0 (patch) | |
tree | 8ef79212f3476f0b1fbbef8fa46b97ea297c6fe3 /newlib/libc/stdlib/wctomb_r.c | |
parent | 0258b687228f0d9d5191615ba0a13f7496f09d3b (diff) | |
download | cygnal-28186e81d947a830d9895cecc2d8e836a3cbccd0.tar.gz cygnal-28186e81d947a830d9895cecc2d8e836a3cbccd0.tar.bz2 cygnal-28186e81d947a830d9895cecc2d8e836a3cbccd0.zip |
* libc/ctype/iswalpha.c: Handle all wchar_t as unicode on
_MB_CAPABLE systems.
* libc/ctype/iswblank.c: Ditto.
* libc/ctype/iswcntrl.c: Ditto.
* libc/ctype/iswprint.c: Ditto.
* libc/ctype/iswpunct.c: Ditto.
* libc/ctype/iswspace.c: Ditto.
* libc/ctype/jp2uc.c (__jp2uc): On Cygwin, just return c.
Explain why.
* libc/ctype/towlower.c: Ditto.
* libc/ctype/towupper.c: Ditto.
* libc/include/sys/config.h: Define _MB_EXTENDED_CHARSETS_ISO
and _MB_EXTENDED_CHARSETS_WINDOWS if _MB_EXTENDED_CHARSETS_ALL is
defined. Define _MB_EXTENDED_CHARSETS_ALL on Cygwin only for now.
* libc/include/sys/reent.h (struct _reent): Mark _current_category
and _current_locale as unused.
* libc/locale/locale.c: Add new charset support to documentation.
Include ../stdio/local.h from here.
(lc_ctype_charset): Set to "ASCII" by default.
(lc_message_charset): Ditto.
(_setlocale_r): Don't set _current_category and _current_locale.
(loadlocale): Add Cygwin codepage support. On _MB_CAPABLE
systems, set __mbtowc and __wctomb function pointers to function
corresponding with current charset. Don't allow non-existent
ISO-8859-12 charset. Add support for Windows singlebyte codepages.
On Cygwin, add support for GBK, CP949, and BIG5. On Cygwin,
call __set_ctype() in case the category is LC_CTYPE. Don't set
_current_category and _current_locale.
* libc/stdlib/Makefile.am (GENERAL_SOURCES): Add sb_charsets.c.
* libc/stdlib/Makefile.in: Regenerate.
* libc/stdlib/local.h: Add prototype for __locale_charset.
Add prototypes for __mbtowc and __wctomb pointers.
Add prototypes for charset-specific _wctomb_r and _mbtowc_r
functions.
Declare tables and functions from sb_charsets.c.
* libc/stdlib/mbtowc_r.c (__mbtowc): Define. Set to __ascii_mbtowc
by default.
(_mbtowc_r): Just call __mbtowc from here.
(__ascii_mbtowc): New function.
(__iso_mbtowc): New function.
(__cp_mbtowc): New function.
(__utf8_mbtowc): New function.
(__sjis_mbtowc): New function. Disable on Cygwin.
(__eucjp_mbtowc): New function. Disable on Cygwin.
(__jis_mbtowc): New function. Disable on Cygwin.
* libc/stdlib/sb_charsets.c: New file, adding singlebyte to UTF
conversion tables for all ISO and CP charsets.
(__iso_8859_index): New function.
(__cp_index): New function.
* libc/stdlib/wctomb_r.c (__wctomb): Define. Set to __ascii_wctomb
by default.
(_wctomb_r): Just call __wctomb from here.
(__ascii_wctomb): New function.
(__utf8_wctomb): New function.
(__sjis_wctomb): New function. Disable on Cygwin.
(__eucjp_wctomb): New function. Disable on Cygwin.
(__jis_wctomb): New function. Disable on Cygwin.
(__iso_wctomb): New function.
(__cp_wctomb): New function.
Diffstat (limited to 'newlib/libc/stdlib/wctomb_r.c')
-rw-r--r-- | newlib/libc/stdlib/wctomb_r.c | 464 |
1 files changed, 301 insertions, 163 deletions
diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c index 8d6d3fc92..64210f232 100644 --- a/newlib/libc/stdlib/wctomb_r.c +++ b/newlib/libc/stdlib/wctomb_r.c @@ -4,11 +4,11 @@ #include <wchar.h> #include <locale.h> #include "mbctype.h" +#include "local.h" -extern char *__locale_charset (); - -/* for some conversions, we use the __count field as a place to store a state value */ -#define __state __count +int (*__wctomb) (struct _reent *, char *, wchar_t, const char *charset, + mbstate_t *) + = __ascii_wctomb; int _DEFUN (_wctomb_r, (r, s, wchar, state), @@ -17,196 +17,287 @@ _DEFUN (_wctomb_r, (r, s, wchar, state), wchar_t _wchar _AND mbstate_t *state) { + return __wctomb (r, s, _wchar, __locale_charset (), state); +} + +int +_DEFUN (__ascii_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ /* Avoids compiler warnings about comparisons that are always false due to limited range when sizeof(wchar_t) is 2 but sizeof(wint_t) is 4, as is the case on cygwin. 
*/ wint_t wchar = _wchar; - if (strlen (__locale_charset ()) <= 1) - { /* fall-through */ } - else if (!strcmp (__locale_charset (), "UTF-8")) + if (s == NULL) + return 0; + + if ((size_t)wchar >= 0x100) { - if (s == NULL) - return 0; /* UTF-8 encoding is not state-dependent */ + r->_errno = EILSEQ; + return -1; + } + + *s = (char) wchar; + return 1; +} - if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff)) +#ifdef _MB_CAPABLE +/* for some conversions, we use the __count field as a place to store a state value */ +#define __state __count + +int +_DEFUN (__utf8_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; + + if (s == NULL) + return 0; /* UTF-8 encoding is not state-dependent */ + + if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff)) + { + /* At this point only the second half of a surrogate pair is valid. */ + r->_errno = EILSEQ; + return -1; + } + if (wchar <= 0x7f) + { + *s = wchar; + return 1; + } + if (wchar >= 0x80 && wchar <= 0x7ff) + { + *s++ = 0xc0 | ((wchar & 0x7c0) >> 6); + *s = 0x80 | (wchar & 0x3f); + return 2; + } + if (wchar >= 0x800 && wchar <= 0xffff) + { + if (wchar >= 0xd800 && wchar <= 0xdfff) { - /* At this point only the second half of a surrogate pair is valid. 
*/ - r->_errno = EILSEQ; - return -1; - } - if (wchar <= 0x7f) - { - *s = wchar; - return 1; - } - else if (wchar >= 0x80 && wchar <= 0x7ff) - { - *s++ = 0xc0 | ((wchar & 0x7c0) >> 6); - *s = 0x80 | (wchar & 0x3f); - return 2; - } - else if (wchar >= 0x800 && wchar <= 0xffff) - { - if (wchar >= 0xd800 && wchar <= 0xdfff) + wint_t tmp; + /* UTF-16 surrogates -- must not occur in normal UCS-4 data */ + if (sizeof (wchar_t) != 2) + { + r->_errno = EILSEQ; + return -1; + } + if (wchar >= 0xdc00) { - wint_t tmp; - /* UTF-16 surrogates -- must not occur in normal UCS-4 data */ - if (sizeof (wchar_t) != 2) + /* Second half of a surrogate pair. It's not valid if + we don't have already read a first half of a surrogate + before. */ + if (state->__count != -4) { r->_errno = EILSEQ; return -1; } - if (wchar >= 0xdc00) - { - /* Second half of a surrogate pair. It's not valid if - we don't have already read a first half of a surrogate - before. */ - if (state->__count != -4) - { - r->_errno = EILSEQ; - return -1; - } - /* If it's valid, reconstruct the full Unicode value and - return the trailing three bytes of the UTF-8 char. */ - tmp = (state->__value.__wchb[0] << 16) - | (state->__value.__wchb[1] << 8) - | (wchar & 0x3ff); - state->__count = 0; - *s++ = 0x80 | ((tmp & 0x3f000) >> 12); - *s++ = 0x80 | ((tmp & 0xfc0) >> 6); - *s = 0x80 | (tmp & 0x3f); - return 3; - } - /* First half of a surrogate pair. Store the state and return - the first byte of the UTF-8 char. */ - tmp = ((wchar & 0x3ff) << 10) + 0x10000; - state->__value.__wchb[0] = (tmp >> 16) & 0xff; - state->__value.__wchb[1] = (tmp >> 8) & 0xff; - state->__count = -4; - *s = (0xf0 | ((tmp & 0x1c0000) >> 18)); - return 1; + /* If it's valid, reconstruct the full Unicode value and + return the trailing three bytes of the UTF-8 char. 
*/ + tmp = (state->__value.__wchb[0] << 16) + | (state->__value.__wchb[1] << 8) + | (wchar & 0x3ff); + state->__count = 0; + *s++ = 0x80 | ((tmp & 0x3f000) >> 12); + *s++ = 0x80 | ((tmp & 0xfc0) >> 6); + *s = 0x80 | (tmp & 0x3f); + return 3; } - *s++ = 0xe0 | ((wchar & 0xf000) >> 12); - *s++ = 0x80 | ((wchar & 0xfc0) >> 6); - *s = 0x80 | (wchar & 0x3f); - return 3; - } - else if (wchar >= 0x10000 && wchar <= 0x10ffff) - { - *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18); - *s++ = 0x80 | ((wchar & 0x3f000) >> 12); - *s++ = 0x80 | ((wchar & 0xfc0) >> 6); - *s = 0x80 | (wchar & 0x3f); - return 4; - } + /* First half of a surrogate pair. Store the state and return + the first byte of the UTF-8 char. */ + tmp = ((wchar & 0x3ff) << 10) + 0x10000; + state->__value.__wchb[0] = (tmp >> 16) & 0xff; + state->__value.__wchb[1] = (tmp >> 8) & 0xff; + state->__count = -4; + *s = (0xf0 | ((tmp & 0x1c0000) >> 18)); + return 1; + } + *s++ = 0xe0 | ((wchar & 0xf000) >> 12); + *s++ = 0x80 | ((wchar & 0xfc0) >> 6); + *s = 0x80 | (wchar & 0x3f); + return 3; + } + if (wchar >= 0x10000 && wchar <= 0x10ffff) + { + *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18); + *s++ = 0x80 | ((wchar & 0x3f000) >> 12); + *s++ = 0x80 | ((wchar & 0xfc0) >> 6); + *s = 0x80 | (wchar & 0x3f); + return 4; + } + + r->_errno = EILSEQ; + return -1; +} + +/* Cygwin defines its own doublebyte charset conversion functions + because the underlying OS requires wchar_t == UTF-16. 
*/ +#ifndef __CYGWIN__ +int +_DEFUN (__sjis_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; + + unsigned char char2 = (unsigned char)wchar; + unsigned char char1 = (unsigned char)(wchar >> 8); + + if (s == NULL) + return 0; /* not state-dependent */ + + if (char1 != 0x00) + { + /* first byte is non-zero..validate multi-byte char */ + if (_issjis1(char1) && _issjis2(char2)) + { + *s++ = (char)char1; + *s = (char)char2; + return 2; + } else { r->_errno = EILSEQ; return -1; } } - else if (!strcmp (__locale_charset (), "SJIS")) + *s = (char) wchar; + return 1; +} + +int +_DEFUN (__eucjp_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; + unsigned char char2 = (unsigned char)wchar; + unsigned char char1 = (unsigned char)(wchar >> 8); + + if (s == NULL) + return 0; /* not state-dependent */ + + if (char1 != 0x00) { - unsigned char char2 = (unsigned char)wchar; - unsigned char char1 = (unsigned char)(wchar >> 8); - - if (s == NULL) - return 0; /* not state-dependent */ - - if (char1 != 0x00) - { - /* first byte is non-zero..validate multi-byte char */ - if (_issjis1(char1) && _issjis2(char2)) - { - *s++ = (char)char1; - *s = (char)char2; - return 2; - } - else - { - r->_errno = EILSEQ; - return -1; - } - } + /* first byte is non-zero..validate multi-byte char */ + if (_iseucjp (char1) && _iseucjp (char2)) + { + *s++ = (char)char1; + *s = (char)char2; + return 2; + } + else + { + r->_errno = EILSEQ; + return -1; + } } - else if (!strcmp (__locale_charset (), "EUCJP")) + *s = (char) wchar; + return 1; +} + +int +_DEFUN (__jis_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; + int cnt = 0; + unsigned 
char char2 = (unsigned char)wchar; + unsigned char char1 = (unsigned char)(wchar >> 8); + + if (s == NULL) + return 1; /* state-dependent */ + + if (char1 != 0x00) { - unsigned char char2 = (unsigned char)wchar; - unsigned char char1 = (unsigned char)(wchar >> 8); - - if (s == NULL) - return 0; /* not state-dependent */ - - if (char1 != 0x00) - { - /* first byte is non-zero..validate multi-byte char */ - if (_iseucjp (char1) && _iseucjp (char2)) - { - *s++ = (char)char1; - *s = (char)char2; - return 2; - } - else + /* first byte is non-zero..validate multi-byte char */ + if (_isjis (char1) && _isjis (char2)) + { + if (state->__state == 0) { - r->_errno = EILSEQ; - return -1; + /* must switch from ASCII to JIS state */ + state->__state = 1; + *s++ = ESC_CHAR; + *s++ = '$'; + *s++ = 'B'; + cnt = 3; } - } + *s++ = (char)char1; + *s = (char)char2; + return cnt + 2; + } + r->_errno = EILSEQ; + return -1; } - else if (!strcmp (__locale_charset (), "JIS")) + if (state->__state != 0) { - int cnt = 0; - unsigned char char2 = (unsigned char)wchar; - unsigned char char1 = (unsigned char)(wchar >> 8); - - if (s == NULL) - return 1; /* state-dependent */ - - if (char1 != 0x00) - { - /* first byte is non-zero..validate multi-byte char */ - if (_isjis (char1) && _isjis (char2)) - { - if (state->__state == 0) - { - /* must switch from ASCII to JIS state */ - state->__state = 1; - *s++ = ESC_CHAR; - *s++ = '$'; - *s++ = 'B'; - cnt = 3; - } - *s++ = (char)char1; - *s = (char)char2; - return cnt + 2; - } - else - { - r->_errno = EILSEQ; - return -1; - } - } - else - { - if (state->__state != 0) - { - /* must switch from JIS to ASCII state */ - state->__state = 0; - *s++ = ESC_CHAR; - *s++ = '('; - *s++ = 'B'; - cnt = 3; - } - *s = (char)char2; - return cnt + 1; - } + /* must switch from JIS to ASCII state */ + state->__state = 0; + *s++ = ESC_CHAR; + *s++ = '('; + *s++ = 'B'; + cnt = 3; } + *s = (char)char2; + return cnt + 1; +} +#endif /* !__CYGWIN__ */ + +#ifdef 
_MB_EXTENDED_CHARSETS_ISO +int +_DEFUN (__iso_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; if (s == NULL) return 0; + + /* wchars <= 0x9f translate to all ISO charsets directly. */ + if (wchar >= 0xa0) + { + int iso_idx = __iso_8859_index (charset + 9); + if (iso_idx >= 0) + { + unsigned char mb; + + if (s == NULL) + return 0; + + for (mb = 0; mb < 0x60; ++mb) + if (__iso_8859_conv[iso_idx][mb] == wchar) + { + *s = (char) (mb + 0xa0); + return 1; + } + r->_errno = EILSEQ; + return -1; + } + } - /* otherwise we are dealing with a single byte character */ if ((size_t)wchar >= 0x100) { r->_errno = EILSEQ; @@ -216,4 +307,51 @@ _DEFUN (_wctomb_r, (r, s, wchar, state), *s = (char) wchar; return 1; } - +#endif /* _MB_EXTENDED_CHARSETS_ISO */ + +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS +int +_DEFUN (__cp_wctomb, (r, s, wchar, charset, state), + struct _reent *r _AND + char *s _AND + wchar_t _wchar _AND + const char *charset _AND + mbstate_t *state) +{ + wint_t wchar = _wchar; + + if (s == NULL) + return 0; + + if (wchar >= 0x80) + { + int cp_idx = __cp_index (charset + 2); + if (cp_idx >= 0) + { + unsigned char mb; + + if (s == NULL) + return 0; + + for (mb = 0; mb < 0x80; ++mb) + if (__cp_conv[cp_idx][mb] == wchar) + { + *s = (char) (mb + 0x80); + return 1; + } + r->_errno = EILSEQ; + return -1; + } + } + + if ((size_t)wchar >= 0x100) + { + r->_errno = EILSEQ; + return -1; + } + + *s = (char) wchar; + return 1; +} +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ +#endif /* _MB_CAPABLE */ |