From 6ff28fc3b12140dfa8539e0cb236a15b3eac089e Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Sat, 3 Oct 2009 08:51:07 +0000 Subject: * libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate value encoding. * libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate value decoding. --- newlib/libc/stdlib/mbtowc_r.c | 6 ---- newlib/libc/stdlib/wctomb_r.c | 67 ++++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 38 deletions(-) (limited to 'newlib/libc/stdlib') diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c index a791692be..0fa57fafd 100644 --- a/newlib/libc/stdlib/mbtowc_r.c +++ b/newlib/libc/stdlib/mbtowc_r.c @@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12) | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6) | (wchar_t)(ch & 0x3f); - /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */ - if (tmp >= 0xd800 && tmp <= 0xdfff) - { - r->_errno = EILSEQ; - return -1; - } *pwc = tmp; return i; } diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c index 207221a7e..2c462ba63 100644 --- a/newlib/libc/stdlib/wctomb_r.c +++ b/newlib/libc/stdlib/wctomb_r.c @@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state), mbstate_t *state) { wint_t wchar = _wchar; + int ret = 0; if (s == NULL) return 0; /* UTF-8 encoding is not state-dependent */ - if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff)) + if (sizeof (wchar_t) == 2 && state->__count == -4 + && (wchar < 0xdc00 || wchar >= 0xdfff)) { - /* At this point only the second half of a surrogate pair is valid. */ - r->_errno = EILSEQ; - return -1; + /* There's a leftover lone high surrogate. Write out the CESU-8 value + of the surrogate and proceed to convert the given character. Note + to return extra 3 bytes. */ + wchar_t tmp; + tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8) + - 0x10000 >> 10 | 0xd80d; + *s++ = 0xe0 | ((tmp & 0xf000) >> 12); + *s++ = 0x80 | ((tmp & 0xfc0) >> 6); + *s++ = 0x80 | (tmp & 0x3f); + state->__count = 0; + ret = 3; } if (wchar <= 0x7f) { *s = wchar; - return 1; + return ret + 1; } if (wchar >= 0x80 && wchar <= 0x7ff) { *s++ = 0xc0 | ((wchar & 0x7c0) >> 6); *s = 0x80 | (wchar & 0x3f); - return 2; + return ret + 2; } if (wchar >= 0x800 && wchar <= 0xffff) { - if (wchar >= 0xd800 && wchar <= 0xdfff) + /* No UTF-16 surrogate handling in UCS-4 */ + if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff) { wint_t tmp; - /* UTF-16 surrogates -- must not occur in normal UCS-4 data */ - if (sizeof (wchar_t) != 2) + if (wchar <= 0xdbff) { - r->_errno = EILSEQ; - return -1; + /* First half of a surrogate pair. Store the state and + return ret + 0. */ + tmp = ((wchar & 0x3ff) << 10) + 0x10000; + state->__value.__wchb[0] = (tmp >> 16) & 0xff; + state->__value.__wchb[1] = (tmp >> 8) & 0xff; + state->__count = -4; + *s = (0xf0 | ((tmp & 0x1c0000) >> 18)); + return ret; } - if (wchar >= 0xdc00) + if (state->__count == -4) { - /* Second half of a surrogate pair. It's not valid if - we don't have already read a first half of a surrogate - before. */ - if (state->__count != -4) - { - r->_errno = EILSEQ; - return -1; - } - /* If it's valid, reconstruct the full Unicode value and - return the trailing three bytes of the UTF-8 char. */ + /* Second half of a surrogate pair. Reconstruct the full + Unicode value and return the trailing three bytes of the + UTF-8 character. */ tmp = (state->__value.__wchb[0] << 16) | (state->__value.__wchb[1] << 8) | (wchar & 0x3ff); state->__count = 0; + *s++ = 0xf0 | ((tmp & 0x1c0000) >> 18); *s++ = 0x80 | ((tmp & 0x3f000) >> 12); *s++ = 0x80 | ((tmp & 0xfc0) >> 6); *s = 0x80 | (tmp & 0x3f); - return 3; + return 4; } - /* First half of a surrogate pair. Store the state and return - the first byte of the UTF-8 char. */ - tmp = ((wchar & 0x3ff) << 10) + 0x10000; - state->__value.__wchb[0] = (tmp >> 16) & 0xff; - state->__value.__wchb[1] = (tmp >> 8) & 0xff; - state->__count = -4; - *s = (0xf0 | ((tmp & 0x1c0000) >> 18)); - return 1; + /* Otherwise translate into CESU-8 value. */ } *s++ = 0xe0 | ((wchar & 0xf000) >> 12); *s++ = 0x80 | ((wchar & 0xfc0) >> 6); *s = 0x80 | (wchar & 0x3f); - return 3; + return ret + 3; } if (wchar >= 0x10000 && wchar <= 0x10ffff) { -- cgit v1.2.3