2006-10-06 Miguel de Icaza <miguel@novell.com>

* src/gunicode.c (g_convert): Add bytes_read, bytes_written support; Small fixes to avoid valgrind errors. (g_utf8_validate): Add, based on the libxml code. gstr.c: implement g_strdown. svn path=/trunk/mono/; revision=66375
author: Miguel de Icaza <miguel@gnome.org> 2006-10-07 00:30:29 +0400
committer: Miguel de Icaza <miguel@gnome.org> 2006-10-07 00:30:29 +0400
commit: 3eead680b0fca6a8da410d3b33facd763bf036e3 (patch)
tree: af86d7e726f0d3ae5851b99852f057ed142387e6 /eglib
parent: 85f44228d3d99024b84af4152fead771e6c945b8 (diff)
6 files changed, 138 insertions, 19 deletions
diff --git a/eglib/ChangeLog b/eglib/ChangeLog
index b997469066f..49db413b28b 100644
--- a/eglib/ChangeLog
+++ b/eglib/ChangeLog
@@ -1,3 +1,12 @@
+2006-10-06  Miguel de Icaza  <miguel@novell.com>
+
+	* src/gunicode.c (g_convert): Add bytes_read, bytes_written
+	support;   Small fixes to avoid valgrind errors. 
+	
+	(g_utf8_validate): Add, based on the libxml code.
+
+	* src/gstr.c (g_strdown): Implement.
+
 2006-10-05  Miguel de Icaza  <miguel@novell.com>
 
 	* src/gunicode.c: Some work in progress to implement g_convert
diff --git a/eglib/TODO b/eglib/TODO
index b517ec0237a..18d5f2fffb2 100644
--- a/eglib/TODO
+++ b/eglib/TODO
@@ -25,16 +25,10 @@ Important Groups:
 	* String manipulation
 	      1 g_filename_from_utf8	[LIMITATION: UTF8 only today]
 	      1 g_ascii_strdown
-	      4 g_strdown
 	     10 g_strlcpy
 	      2 g_strdelimit
 	      1 g_strescape
 	
-	* Character encoding
-	      3 g_utf8_validate
-	      3 g_locale_to_utf8
-	      1 g_locale_from_utf8
-
 	* Miscelaneous
 	      3 g_spaced_primes_closest
 	      2 g_printerr
@@ -56,7 +50,6 @@ Macros:
 
 * Unimplemented, not supported currently:
 
-	      3 g_convert   	 	(used for MONO_EXTERNAL_ENCODINGS)
 		g_unichar_tolower	Used for deprecated unmanaged string collation
 		g_unichar_type		Used for deprecated unmanaged string collation
 
diff --git a/eglib/src/glib.h b/eglib/src/glib.h
index a6098037ba7..f124309c967 100644
--- a/eglib/src/glib.h
+++ b/eglib/src/glib.h
@@ -651,11 +651,14 @@ gboolean  g_get_charset        (char **charset);
 gchar    *g_locale_to_utf8     (const gchar *opsysstring, gssize len,
 				gsize *bytes_read, gsize *bytes_written,
 				GError **error);
+gchar    *g_locale_from_utf8   (const gchar *utf8string, gssize len, gsize *bytes_read,
+				gsize *bytes_written, GError **error);
 gchar    *g_filename_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read,
 				gsize *bytes_written, GError **error);
 gchar    *g_convert            (const gchar *str, gssize len,
 				const gchar *to_codeset, const gchar *from_codeset,
 				gsize *bytes_read, gsize *bytes_written, GError **error);
+gboolean  g_utf8_validate      (const gchar *str, gssize max_len, const gchar **end);
 
 
 #endif
diff --git a/eglib/src/gstr.c b/eglib/src/gstr.c
index ecb30611252..f837972f70b 100644
--- a/eglib/src/gstr.c
+++ b/eglib/src/gstr.c
@@ -475,3 +475,13 @@ g_filename_from_uri (const gchar *uri, gchar **hostname, GError **error)
 	}
 	return result;
 }
+
+/* Lowercases @string in place with tolower(), byte by byte; a NULL @string is rejected. */
+void
+g_strdown (gchar *string)
+{
+	g_return_if_fail (string != NULL);
+	/* Advance one byte per pass; <ctype.h> functions need unsigned char values. */
+	for (; *string; string++)
+		*string = tolower ((unsigned char) *string);
+}
diff --git a/eglib/src/gunicode.c b/eglib/src/gunicode.c
index fd7603815f0..3833ba738e5 100644
--- a/eglib/src/gunicode.c
+++ b/eglib/src/gunicode.c
@@ -6,6 +6,13 @@
  *
  * (C) 2006 Novell, Inc.
  *
+ * utf8 validation code came from:
+ * 	libxml2-2.6.26 licensed under the MIT X11 license
+ *
+ * Authors credit in libxml's string.c:
+ *   William Brack <wbrack@mmm.com.hk>
+ *   daniel@veillard.com
+ *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -24,6 +31,7 @@
  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
  */
 #include <stdio.h>
 #include <glib.h>
@@ -55,6 +63,7 @@ g_convert (const gchar *str, gssize len,
 {
 	iconv_t convertor;
 	char *buffer, *result, *output;
+	const char *strptr = (const char *) str;
 	int str_len = len == -1 ? strlen (str) : len;
 	int buffer_size;
 	size_t left, out_left;
@@ -66,19 +75,21 @@ g_convert (const gchar *str, gssize len,
 		return NULL;
 	}
 
-	buffer_size = out_left = str_len + 1;
-	buffer = g_malloc (out_left);
+	buffer_size = str_len + 1 + 8;
+	buffer = g_malloc (buffer_size);
+	out_left = str_len;
 	output = buffer;
 	left = str_len;
 	while (left > 0){
-		int res = iconv (convertor, (char **) &str, &left, &output, &out_left);
+		int res = iconv (convertor, (char **) &strptr, &left, &output, &out_left);
 		if (res == (size_t) -1){
-			int out_size = buffer_size - out_left;
-			
 			if (errno == E2BIG){
 				char *n;
-
-				buffer_size += left + 2;
+				int extra_space = 8 + left;
+				int output_used = output - buffer;
+				
+				buffer_size += extra_space;
+				
 				n = g_realloc (buffer, buffer_size);
 				
 				if (n == NULL){
@@ -89,8 +100,8 @@ g_convert (const gchar *str, gssize len,
 					goto leave;
 				}
 				buffer = n;
-				output = buffer + out_size;
-				out_left = buffer_size - out_size;
+				out_left += extra_space;
+				output = buffer + output_used;
 			} else if (errno == EILSEQ){
 				if (error != NULL)
 					*error = g_error_new (NULL, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid multi-byte sequence on input");
@@ -106,6 +117,10 @@ g_convert (const gchar *str, gssize len,
 			}
 		} 
 	}
+	if (bytes_read != NULL)
+		*bytes_read = strptr - str;
+	if (bytes_written != NULL)
+		*bytes_written = output - buffer;
 	*output = 0;
 	result = buffer;
  leave:
@@ -151,3 +166,82 @@ g_locale_to_utf8 (const gchar *opsysstring, gssize len, gsize *bytes_read, gsize
 	return g_convert (opsysstring, len, "UTF-8", my_charset, bytes_read, bytes_written, error);
 }
 
+/* Converts @utf8string from UTF-8 to the charset named by my_charset, via g_convert(). */
+gchar *
+g_locale_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read, gsize *bytes_written, GError **error)
+{
+	g_get_charset (NULL);	/* return value unused; presumably fills my_charset -- TODO confirm */
+	return g_convert (utf8string, len, my_charset, "UTF-8", bytes_read, bytes_written, error);
+}
+/**
+ * g_utf8_validate
+ * @utf: Pointer to putative UTF-8 encoded string.
+ * @max_len: Number of bytes of @utf to examine, or -1 to examine up to
+ *   the terminating NUL (the length is then taken with strlen).
+ * @end: If not NULL, receives the end of the valid range: the lead byte
+ *   of the offending sequence on failure, or @utf + length on success.
+ *
+ * Checks @utf for being structurally valid UTF-8: every lead byte must
+ * announce a 1- to 4-byte sequence and be followed, inside the examined
+ * range, by the matching number of 10xxxxxx continuation bytes. This
+ * function is not super-strict, as it will allow longer UTF-8 sequences
+ * than necessary (overlong forms). Note that Java is capable of
+ * producing these sequences if provoked. Also note, this routine checks
+ * for the 4-byte maximum size, but does not check for the 0x10ffff
+ * maximum value. With an explicit @max_len, NUL bytes inside the range
+ * are accepted as ordinary 1-byte characters.
+ *
+ * Return value: TRUE if the examined bytes are valid, FALSE otherwise
+ * (including when @utf is NULL, in which case @end is left untouched).
+ **/
+gboolean
+g_utf8_validate (const gchar *utf, gssize max_len, const gchar **end)
+{
+	gssize ix = 0;	/* same type as max_len, so long inputs cannot overflow it */
+	gboolean valid = TRUE;
+
+	g_return_val_if_fail (utf != NULL, FALSE);
+
+	if (max_len == -1)
+		max_len = strlen (utf);
+
+	/*
+	 * utf is a string of 1, 2, 3 or 4 byte sequences.  The valid
+	 * sequences are as follows (in "bit format"):
+	 *    0xxxxxxx                                      valid 1-byte
+	 *    110xxxxx 10xxxxxx                             valid 2-byte
+	 *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
+	 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
+	 */
+	while (valid && ix < max_len) {
+		unsigned char c = utf[ix];
+		gssize seq_len, k;
+
+		if ((c & 0x80) == 0x00)		/* 1-byte code, starts with 0 */
+			seq_len = 1;
+		else if ((c & 0xe0) == 0xc0)	/* 2-byte code, starts with 110 */
+			seq_len = 2;
+		else if ((c & 0xf0) == 0xe0)	/* 3-byte code, starts with 1110 */
+			seq_len = 3;
+		else if ((c & 0xf8) == 0xf0)	/* 4-byte code, starts with 11110 */
+			seq_len = 4;
+		else				/* stray continuation or unknown lead byte */
+			seq_len = 0;
+
+		/* Reject unknown lead bytes and sequences cut off by max_len. */
+		valid = seq_len != 0 && seq_len <= max_len - ix;
+
+		/* Every byte after the lead must look like 10xxxxxx. */
+		for (k = 1; valid && k < seq_len; k++)
+			valid = (utf[ix + k] & 0xc0) == 0x80;
+
+		if (valid)
+			ix += seq_len;
+	}
+
+	/* ix rests on the offending lead byte, or past the last byte examined. */
+	if (end != NULL)
+		*end = utf + ix;
+
+	return valid;
+}
diff --git a/eglib/test/utf8.c b/eglib/test/utf8.c
index 01f5482e3b9..303f3620852 100644
--- a/eglib/test/utf8.c
+++ b/eglib/test/utf8.c
@@ -197,9 +197,19 @@ RESULT
 test_convert ()
 {
 	gsize n;
-	char *s = g_convert ("\242", -1, "UTF-8", "ISO-8859-1", NULL, &n, NULL);
-
-	printf ("Result: %s %d\n", s, strlen (s));
+	char *s = g_convert ("\242\241\243\242\241\243\242\241\243\242\241\243", -1, "UTF-8", "ISO-8859-1", NULL, &n, NULL);
+	guchar *u = (guchar *) s;
+	
+	if (strlen (s) != 24)
+		return FAILED ("Expected 24 bytes, got: %d", strlen (s));
+
+	if (u [1] != 162 || u [2] != 194 ||
+	    u [3] != 161 || u [4] != 194 ||
+	    u [5] != 163 || u [6] != 194)
+		return FAILED ("Incorrect conversion");
+	
+	g_free (s);
+	
 	return OK;
 }
author	Miguel de Icaza <miguel@gnome.org>	2006-10-07 00:30:29 +0400
committer	Miguel de Icaza <miguel@gnome.org>	2006-10-07 00:30:29 +0400
commit	3eead680b0fca6a8da410d3b33facd763bf036e3 (patch)
tree	af86d7e726f0d3ae5851b99852f057ed142387e6 /eglib
parent	85f44228d3d99024b84af4152fead771e6c945b8 (diff)