Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Corinna Vinschen <corinna@vinschen.de>, 2009-02-25 12:10:09 +0300
committer: Corinna Vinschen <corinna@vinschen.de>, 2009-02-25 12:10:09 +0300
commit: 8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee (patch)
tree: f1e9eafe5ee6277ac116cc680ff322d2ffe9366d /newlib/libc/stdlib
parent: 56eafaf6e399ac17343e845b4f6bbcf93be61f94 (diff)
* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
sequences since they are invalid in the Unicode standard. Handle
surrogate pairs in case of wchar_t == UTF-16.
* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t values
beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in case of
wchar_t == UTF-16.
Diffstat (limited to 'newlib/libc/stdlib')
-rw-r--r-- newlib/libc/stdlib/mbtowc_r.c | 153
-rw-r--r-- newlib/libc/stdlib/wctomb_r.c | 61
2 files changed, 70 insertions, 144 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index 71bbf8537..00021beff 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
if (s == NULL)
return 0; /* UTF-8 character encodings are not state-dependent */
+ if (state->__count == 4)
+ {
+ /* Create the second half of the surrogate pair. For a description
+ see the comment below. */
+ wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
+ | (wchar_t)(state->__value.__wchb[3] & 0x3f);
+ state->__count = 0;
+ *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
+ return 2;
+ }
if (state->__count == 0)
ch = t[i++];
else
@@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
else if (ch >= 0xf0 && ch <= 0xf7)
{
/* four-byte sequence */
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
+ wint_t tmp;
state->__value.__wchb[0] = ch;
if (state->__count == 0)
state->__count = 1;
@@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
return -1;
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
- | (wchar_t)(ch & 0x3f);
-
- state->__count = 0;
- return i;
- }
- else if (ch >= 0xf8 && ch <= 0xfb)
- {
- /* five-byte sequence */
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
- state->__value.__wchb[0] = ch;
- if (state->__count == 0)
- state->__count = 1;
- else if (n < (size_t)-1)
- ++n;
- if (n < 2)
- return -2;
- ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
- if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
- /* overlong UTF-8 sequence */
- return -1;
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[1] = ch;
- if (state->__count == 1)
- state->__count = 2;
- else if (n < (size_t)-1)
- ++n;
- if (n < 3)
- return -2;
- ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[2] = ch;
- if (state->__count == 2)
- state->__count = 3;
- else if (n < (size_t)-1)
- ++n;
- if (n < 4)
- return -2;
- ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[3] = ch;
- state->__count = 4;
- if (n < 5)
- return -2;
- ch = t[i++];
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
- | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
- | (wchar_t)(ch & 0x3f);
-
- state->__count = 0;
- return i;
- }
- else if (ch >= 0xfc && ch <= 0xfd)
- {
- /* six-byte sequence */
- int ch2;
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
- state->__value.__wchb[0] = ch;
- if (state->__count == 0)
- state->__count = 1;
- else if (n < (size_t)-1)
- ++n;
- if (n < 2)
- return -2;
- ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
- if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
- /* overlong UTF-8 sequence */
- return -1;
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[1] = ch;
- if (state->__count == 1)
- state->__count = 2;
- else if (n < (size_t)-1)
- ++n;
- if (n < 3)
- return -2;
- ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[2] = ch;
- if (state->__count == 2)
- state->__count = 3;
- else if (n < (size_t)-1)
- ++n;
- if (n < 4)
- return -2;
- ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[3] = ch;
- if (state->__count == 3)
- state->__count = 4;
- else if (n < (size_t)-1)
- ++n;
- if (n < 5)
- return -2;
- if (n == 5)
- return -1; /* at this point we can't save enough to restart */
- ch = t[i++];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- ch2 = t[i++];
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
- | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
- | (wchar_t)((ch & 0x3f) << 6)
- | (wchar_t)(ch2 & 0x3f);
-
+ tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
+ | (wint_t)(ch & 0x3f);
+ if (tmp > 0xffff && sizeof(wchar_t) == 2)
+ {
+ /* On systems which have wchar_t being UTF-16 values, the value
+ doesn't fit into a single wchar_t in this case. So what we
+ do here is to store the state with a special value of __count
+ and return the first half of a surrogate pair. As return
+ value we choose to return the half of the actual UTF-8 char.
+ The second half is returned in case we recognize the special
+ __count value above. */
+ state->__value.__wchb[3] = ch;
+ state->__count = 4;
+ *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
+ return 2;
+ }
+ *pwc = tmp;
state->__count = 0;
return i;
}
diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c
index 82730424f..c96d954a1 100644
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@@ -28,6 +28,11 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
if (s == NULL)
return 0; /* UTF-8 encoding is not state-dependent */
+ if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
+ {
+ /* At this point only the second half of a surrogate pair is valid. */
+ return -1;
+ }
if (wchar <= 0x7f)
{
*s = wchar;
@@ -41,16 +46,45 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
}
else if (wchar >= 0x800 && wchar <= 0xffff)
{
- /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
if (wchar >= 0xd800 && wchar <= 0xdfff)
- return -1;
-
+ {
+ wint_t tmp;
+ /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
+ if (sizeof (wchar_t) != 2)
+ return -1;
+ if (wchar >= 0xdc00)
+ {
+ /* Second half of a surrogate pair. It's not valid if
+ we don't have already read a first half of a surrogate
+ before. */
+ if (state->__count != -4)
+ return -1;
+ /* If it's valid, reconstruct the full Unicode value and
+ return the trailing three bytes of the UTF-8 char. */
+ tmp = (state->__value.__wchb[0] << 16)
+ | (state->__value.__wchb[1] << 8)
+ | (wchar & 0x3ff);
+ state->__count = 0;
+ *s++ = 0x80 | ((tmp & 0x3f000) >> 12);
+ *s++ = 0x80 | ((tmp & 0xfc0) >> 6);
+ *s = 0x80 | (tmp & 0x3f);
+ return 3;
+ }
+ /* First half of a surrogate pair. Store the state and return
+ the first byte of the UTF-8 char. */
+ tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+ state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+ state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+ state->__count = -4;
+ *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+ return 1;
+ }
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 3;
}
- else if (wchar >= 0x10000 && wchar <= 0x1fffff)
+ else if (wchar >= 0x10000 && wchar <= 0x10ffff)
{
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
@@ -58,25 +92,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
*s = 0x80 | (wchar & 0x3f);
return 4;
}
- else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
- {
- *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
- *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
- *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
- *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
- *s = 0x80 | (wchar & 0x3f);
- return 5;
- }
- else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
- {
- *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
- *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
- *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
- *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
- *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
- *s = 0x80 | (wchar & 0x3f);
- return 6;
- }
else
return -1;
}