Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCorinna Vinschen <corinna@vinschen.de>2009-10-03 12:51:07 +0400
committerCorinna Vinschen <corinna@vinschen.de>2009-10-03 12:51:07 +0400
commit6ff28fc3b12140dfa8539e0cb236a15b3eac089e (patch)
treed9d494d84dabeff6b8d93a21bd5dbcd5192345e8 /newlib/libc/stdlib
parent9c47bbb6e9bd8d95775a7887209d71f84d7e850c (diff)
* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
value encoding. * libc/stdlib/wctomb_r.c (__utf8_mbtowc): Allow CESU-8 surrogate value decoding.
Diffstat (limited to 'newlib/libc/stdlib')
-rw-r--r--newlib/libc/stdlib/mbtowc_r.c6
-rw-r--r--newlib/libc/stdlib/wctomb_r.c67
2 files changed, 35 insertions, 38 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index a791692be..0fa57fafd 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
| (wchar_t)(ch & 0x3f);
- /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
- if (tmp >= 0xd800 && tmp <= 0xdfff)
- {
- r->_errno = EILSEQ;
- return -1;
- }
*pwc = tmp;
return i;
}
diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c
index 207221a7e..2c462ba63 100644
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
mbstate_t *state)
{
wint_t wchar = _wchar;
+ int ret = 0;
if (s == NULL)
return 0; /* UTF-8 encoding is not state-dependent */
- if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
+ if (sizeof (wchar_t) == 2 && state->__count == -4
+ && (wchar < 0xdc00 || wchar >= 0xdfff))
{
- /* At this point only the second half of a surrogate pair is valid. */
- r->_errno = EILSEQ;
- return -1;
+ /* There's a leftover lone high surrogate. Write out the CESU-8 value
+ of the surrogate and proceed to convert the given character. Note
+ to return extra 3 bytes. */
+ wchar_t tmp;
+ tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
+ - 0x10000 >> 10 | 0xd80d;
+ *s++ = 0xe0 | ((tmp & 0xf000) >> 12);
+ *s++ = 0x80 | ((tmp & 0xfc0) >> 6);
+ *s++ = 0x80 | (tmp & 0x3f);
+ state->__count = 0;
+ ret = 3;
}
if (wchar <= 0x7f)
{
*s = wchar;
- return 1;
+ return ret + 1;
}
if (wchar >= 0x80 && wchar <= 0x7ff)
{
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
*s = 0x80 | (wchar & 0x3f);
- return 2;
+ return ret + 2;
}
if (wchar >= 0x800 && wchar <= 0xffff)
{
- if (wchar >= 0xd800 && wchar <= 0xdfff)
+ /* No UTF-16 surrogate handling in UCS-4 */
+ if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
{
wint_t tmp;
- /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
- if (sizeof (wchar_t) != 2)
+ if (wchar <= 0xdbff)
{
- r->_errno = EILSEQ;
- return -1;
+ /* First half of a surrogate pair. Store the state and
+ return ret + 0. */
+ tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+ state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+ state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+ state->__count = -4;
+ *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+ return ret;
}
- if (wchar >= 0xdc00)
+ if (state->__count == -4)
{
- /* Second half of a surrogate pair. It's not valid if
- we don't have already read a first half of a surrogate
- before. */
- if (state->__count != -4)
- {
- r->_errno = EILSEQ;
- return -1;
- }
- /* If it's valid, reconstruct the full Unicode value and
- return the trailing three bytes of the UTF-8 char. */
+ /* Second half of a surrogate pair. Reconstruct the full
+ Unicode value and return the trailing three bytes of the
+ UTF-8 character. */
tmp = (state->__value.__wchb[0] << 16)
| (state->__value.__wchb[1] << 8)
| (wchar & 0x3ff);
state->__count = 0;
+ *s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
*s = 0x80 | (tmp & 0x3f);
- return 3;
+ return 4;
}
- /* First half of a surrogate pair. Store the state and return
- the first byte of the UTF-8 char. */
- tmp = ((wchar & 0x3ff) << 10) + 0x10000;
- state->__value.__wchb[0] = (tmp >> 16) & 0xff;
- state->__value.__wchb[1] = (tmp >> 8) & 0xff;
- state->__count = -4;
- *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
- return 1;
+ /* Otherwise translate into CESU-8 value. */
}
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
- return 3;
+ return ret + 3;
}
if (wchar >= 0x10000 && wchar <= 0x10ffff)
{