2006-10-30 Atsushi Enomoto <atsushi@ximian.com>

* src/gutf8.c : several fixes:
  - fixed incorrect mb_size clear and mb_remain computation.
  - initialize items_written to 0 for error case.
  - in utf8_to_utf16_len differentiate error messages completely.
  - in g_utf8_to_utf16 use guchar instead of gchar.
* test/utf8.c : added test case string in test_utf8_seq() to both
  test_utf8_to_utf16() and test_utf16_to_utf8().

svn path=/trunk/mono/; revision=67173
author: Atsushi Eno <atsushieno@gmail.com> 2006-10-31 07:01:04 +0300
committer: Atsushi Eno <atsushieno@gmail.com> 2006-10-31 07:01:04 +0300
commit: 0afb42d28e8817715b3eadeec95c41591a5dcd74 (patch)
tree: 91d1affe8919cc1134ebecd9f357a45dfae6bb83 /eglib
parent: 9388e8a295f8f86d35e31f824d95acb0fe17fd60 (diff)
3 files changed, 59 insertions, 27 deletions
diff --git a/eglib/ChangeLog b/eglib/ChangeLog
index 71bb3890f1c..5e246afbad7 100644
--- a/eglib/ChangeLog
+++ b/eglib/ChangeLog
@@ -1,3 +1,13 @@
+2006-10-30  Atsushi Enomoto  <atsushi@ximian.com>
+
+	* src/gutf8.c : several fixes:
+	  - fixed incorrect mb_size clear and mb_remain computation.
+	  - initialize items_written to 0 for error case.
+	  - in utf8_to_utf16_len differentiate error messages completely.
+	  - in g_utf8_to_utf16 use guchar instead of gchar.
+	* test/utf8.c : added test case string in test_utf8_seq() to
+	  both test_utf8_to_utf16() and test_utf16_to_utf8().
+
 2006-10-21  Miguel de Icaza  <miguel@novell.com>
 
 	* src/gunicode.c (g_filename_from_utf8): Use g_strlcpy here. 
diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c
index 6239178b04b..77f96bedbb6 100644
--- a/eglib/src/gutf8.c
+++ b/eglib/src/gutf8.c
@@ -29,7 +29,7 @@ g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_wr
 	   the conversion core below simply resets erroreous bits */
 	glong utf16_len;
 	gunichar2 *ret;
-	gchar ch, mb_size, mb_remain;
+	guchar ch, mb_size, mb_remain;
 	guint32 codepoint;
 	glong in_pos, out_pos;
 
@@ -42,6 +42,8 @@ g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_wr
 	if (error)
 		*error = NULL;
 
+	if (items_written)
+		*items_written = 0;
 	utf16_len = utf8_to_utf16_len (str, len, items_read, error);
 	if (error)
 		if (*error)
@@ -54,35 +56,37 @@ g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_wr
 	for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 		ch = (guchar) str [in_pos];
 		if (mb_size == 0) {
-			if (0 < ch)
+			if (ch < 0x80)
 				ret [out_pos++] = ch;
 			else if ((ch & 0xE0) == 0xC0) {
 				codepoint = ch & 0x1F;
-				mb_remain = mb_size = 2;
+				mb_size = 2;
 			} else if ((ch & 0xF0) == 0xE0) {
 				codepoint = ch & 0x0F;
-				mb_remain = mb_size = 3;
+				mb_size = 3;
 			} else if ((ch & 0xF8) == 0xF0) {
 				codepoint = ch & 7;
-				mb_remain = mb_size = 4;
+				mb_size = 4;
 			} else if ((ch & 0xFC) == 0xF8) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 5;
+				mb_size = 5;
 			} else if ((ch & 0xFE) == 0xFC) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 6;
+				mb_size = 6;
 			} else {
 				/* invalid utf-8 sequence */
 				codepoint = 0;
 				mb_remain = mb_size = 0;
 			}
+			if (mb_size > 1)
+				mb_remain = mb_size - 1;
 		} else {
 			if ((ch & 0xC0) == 0x80) {
 				codepoint = (codepoint << 6) | (ch & 0x3F);
 				if (--mb_remain == 0) {
 					/* multi byte character is fully consumed now. */
 					if (codepoint < 0x10000) {
-						ret [out_pos++] = codepoint;
+						ret [out_pos++] = codepoint % 0x10000;
 					} else if (codepoint < 0x110000) {
 						/* surrogate pair */
 						codepoint -= 0x10000;
@@ -91,8 +95,9 @@ g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_wr
 					} else {
 						/* invalid utf-8 sequence (excess) */
 						codepoint = 0;
-						mb_remain = mb_size = 0;
+						mb_remain = 0;
 					}
+					mb_size = 0;
 				}
 			} else {
 				/* invalid utf-8 sequence */
@@ -130,23 +135,23 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 				ret++;
 			else if ((ch & 0xE0) == 0xC0) {
 				codepoint = ch & 0x1F;
-				mb_remain = mb_size = 2;
+				mb_size = 2;
 			} else if ((ch & 0xF0) == 0xE0) {
 				codepoint = ch & 0x0F;
-				mb_remain = mb_size = 3;
+				mb_size = 3;
 			} else if ((ch & 0xF8) == 0xF0) {
 				codepoint = ch & 7;
-				mb_remain = mb_size = 4;
+				mb_size = 4;
 			} else if ((ch & 0xFC) == 0xF8) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 5;
+				mb_size = 5;
 			} else if ((ch & 0xFE) == 0xFC) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 6;
+				mb_size = 6;
 			} else {
 				/* invalid utf-8 sequence */
 				if (error) {
-					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
 					if (items_read)
 						*items_read = in_pos;
 					return -1;
@@ -155,6 +160,8 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 					mb_remain = mb_size = 0;
 				}
 			}
+			if (mb_size > 1)
+				mb_remain = mb_size - 1;
 		} else {
 			if ((ch & 0xC0) == 0x80) {
 				codepoint = (codepoint << 6) | (ch & 0x3F);
@@ -187,7 +194,7 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 								return -1;
 							} else {
 								codepoint = 0;
-								mb_remain = mb_size = 0;
+								mb_remain = 0;
 								overlong = FALSE;
 							}
 						}
@@ -205,14 +212,15 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 							return -1;
 						} else {
 							codepoint = 0;
-							mb_remain = mb_size = 0;
+							mb_remain = 0;
 						}
 					}
+					mb_size = 0;
 				}
 			} else {
 				/* invalid utf-8 sequence */
 				if (error) {
-					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
 					if (items_read)
 						*items_read = in_pos;
 					return -1;
@@ -246,6 +254,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 	out_pos = 0;
 	surrogate = FALSE;
 
+	if (items_written)
+		*items_written = 0;
 	utf8_len = utf16_to_utf8_len (str, len, items_read, error);
 	if (error)
 		if (*error)
diff --git a/eglib/test/utf8.c b/eglib/test/utf8.c
index eff6ef3d46d..5b4c2608634 100644
--- a/eglib/test/utf8.c
+++ b/eglib/test/utf8.c
@@ -82,11 +82,10 @@ compare_utf16_to_utf8 (const gchar *expected, const gunichar2 *utf16, glong len_
 RESULT
 test_utf16_to_utf8 ()
 {
-	const gchar *src0 = "", *src1 = "ABCDE";
-	gunichar2 str0 [1], str1 [6];
+	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27";
+	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0};
 	RESULT result;
 
-	str0 [0] = 0;
 	gchar_to_gunichar2 (str1, src1);
 
 	/* empty string */
@@ -97,6 +96,9 @@ test_utf16_to_utf8 ()
 	result = compare_utf16_to_utf8 (src1, str1, 5, 5);
 	if (result != OK)
 		return result;
+	result = compare_utf16_to_utf8 (src2, str2, 2, 4);
+	if (result != OK)
+		return result;
 
 	return OK;
 }
@@ -174,28 +176,35 @@ compare_utf8_to_utf16 (const gunichar2 *expected, const gchar *utf8, glong len_i
 RESULT
 test_utf8_seq ()
 {
-	const gchar *src = "\345\271\264\47";
+	const gchar *src = "\xE5\xB9\xB4\x27";
 	glong in_read, out_read;
 	//gunichar2 expected [6];
 	GError *error = NULL;
+	gunichar2 *dst;
 
 	printf ("got: %s\n", src);
-	g_utf8_to_utf16 (src, strlen (src), &in_read, &out_read, &error);
+	dst = g_utf8_to_utf16 (src, strlen (src), &in_read, &out_read, &error);
 	if (error != NULL){
 		return error->message;
 	}
-	
+
+	if (in_read != 4) {
+		return FAILED ("in_read is expected to be 4 but was %d\n", in_read);
+	}
+	if (out_read != 2) {
+		return FAILED ("out_read is expected to be 2 but was %d\n", out_read);
+	}
+
 	return OK;
 }
 
 RESULT
 test_utf8_to_utf16 ()
 {
-	const gchar *src0 = "", *src1 = "ABCDE";
-	gunichar2 str0 [1], str1 [6];
+	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27";
+	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0};
 	RESULT result;
 
-	str0 [0] = 0;
 	gchar_to_gunichar2 (str1, src1);
 
 	/* empty string */
@@ -206,6 +215,9 @@ test_utf8_to_utf16 ()
 	result = compare_utf8_to_utf16 (str1, src1, 5, 5);
 	if (result != OK)
 		return result;
+	result = compare_utf8_to_utf16 (str2, src2, 4, 2);
+	if (result != OK)
+		return result;
 
 	return OK;
 }
author	Atsushi Eno <atsushieno@gmail.com>	2006-10-31 07:01:04 +0300
committer	Atsushi Eno <atsushieno@gmail.com>	2006-10-31 07:01:04 +0300
commit	0afb42d28e8817715b3eadeec95c41591a5dcd74 (patch)
tree	91d1affe8919cc1134ebecd9f357a45dfae6bb83 /eglib
parent	9388e8a295f8f86d35e31f824d95acb0fe17fd60 (diff)