From 6ff28fc3b12140dfa8539e0cb236a15b3eac089e Mon Sep 17 00:00:00 2001
From: Corinna Vinschen <corinna@vinschen.de>
Date: Sat, 3 Oct 2009 08:51:07 +0000
Subject: * libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8
 surrogate value decoding. * libc/stdlib/wctomb_r.c
 (__utf8_wctomb): Allow CESU-8 surrogate value encoding.

---
 newlib/libc/stdlib/mbtowc_r.c |  6 ----
 newlib/libc/stdlib/wctomb_r.c | 67 ++++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 38 deletions(-)

(limited to 'newlib/libc/stdlib')

diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index a791692be..0fa57fafd 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
       tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
 	|    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
 	|     (wchar_t)(ch & 0x3f);
-      /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
-      if (tmp >= 0xd800 && tmp <= 0xdfff)
-	{
-	  r->_errno = EILSEQ;
-	  return -1;
-	}
       *pwc = tmp;
       return i;
     }
diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c
index 207221a7e..2c462ba63 100644
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
         mbstate_t     *state)
 {
   wint_t wchar = _wchar;
+  int ret = 0;
 
   if (s == NULL)
     return 0; /* UTF-8 encoding is not state-dependent */
 
-  if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
+  if (sizeof (wchar_t) == 2 && state->__count == -4
+      && (wchar < 0xdc00 || wchar >= 0xdfff))
     {
-      /* At this point only the second half of a surrogate pair is valid. */
-      r->_errno = EILSEQ;
-      return -1;
+      /* There's a leftover lone high surrogate.  Write out the CESU-8 value
+	 of the surrogate and proceed to convert the given character.  Note
+	 to return extra 3 bytes. */
+      wchar_t tmp;
+      tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
+	    - 0x10000 >> 10 | 0xd80d;
+      *s++ = 0xe0 | ((tmp & 0xf000) >> 12);
+      *s++ = 0x80 | ((tmp &  0xfc0) >> 6);
+      *s++ = 0x80 |  (tmp &   0x3f);
+      state->__count = 0;
+      ret = 3;
     }
   if (wchar <= 0x7f)
     {
       *s = wchar;
-      return 1;
+      return ret + 1;
     }
   if (wchar >= 0x80 && wchar <= 0x7ff)
     {
       *s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
       *s   = 0x80 |  (wchar &  0x3f);
-      return 2;
+      return ret + 2;
     }
   if (wchar >= 0x800 && wchar <= 0xffff)
     {
-      if (wchar >= 0xd800 && wchar <= 0xdfff)
+      /* No UTF-16 surrogate handling in UCS-4 */
+      if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
 	{
 	  wint_t tmp;
-	  /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
-	  if (sizeof (wchar_t) != 2)
+	  if (wchar <= 0xdbff)
 	    {
-	      r->_errno = EILSEQ;
-	      return -1;
+	      /* First half of a surrogate pair.  Store the state and
+	         return ret + 0. */
+	      tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+	      state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+	      state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+	      state->__count = -4;
+	      *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+	      return ret;
 	    }
-	  if (wchar >= 0xdc00)
+	  if (state->__count == -4)
 	    {
-	      /* Second half of a surrogate pair. It's not valid if
-		 we don't have already read a first half of a surrogate
-		 before. */
-	      if (state->__count != -4)
-		{
-		  r->_errno = EILSEQ;
-		  return -1;
-		}
-	      /* If it's valid, reconstruct the full Unicode value and
-		 return the trailing three bytes of the UTF-8 char. */
+	      /* Second half of a surrogate pair.  Reconstruct the full
+		 Unicode value and return the trailing three bytes of the
+		 UTF-8 character. */
 	      tmp = (state->__value.__wchb[0] << 16)
 		    | (state->__value.__wchb[1] << 8)
 		    | (wchar & 0x3ff);
 	      state->__count = 0;
+	      *s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
 	      *s++ = 0x80 | ((tmp &  0x3f000) >> 12);
 	      *s++ = 0x80 | ((tmp &    0xfc0) >> 6);
 	      *s   = 0x80 |  (tmp &     0x3f);
-	      return 3;
+	      return 4;
 	    }
-	  /* First half of a surrogate pair.  Store the state and return
-	     the first byte of the UTF-8 char. */
-	  tmp = ((wchar & 0x3ff) << 10) + 0x10000;
-	  state->__value.__wchb[0] = (tmp >> 16) & 0xff;
-	  state->__value.__wchb[1] = (tmp >> 8) & 0xff;
-	  state->__count = -4;
-	  *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
-	  return 1;
+	  /* Otherwise translate into CESU-8 value. */
 	}
       *s++ = 0xe0 | ((wchar & 0xf000) >> 12);
       *s++ = 0x80 | ((wchar &  0xfc0) >> 6);
       *s   = 0x80 |  (wchar &   0x3f);
-      return 3;
+      return ret + 3;
     }
   if (wchar >= 0x10000 && wchar <= 0x10ffff)
     {
-- 
cgit v1.2.3