Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/mono.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/eglib
diff options
context:
space:
mode:
author: Miguel de Icaza <miguel@gnome.org> 2006-10-07 00:30:29 +0400
committer: Miguel de Icaza <miguel@gnome.org> 2006-10-07 00:30:29 +0400
commit: 3eead680b0fca6a8da410d3b33facd763bf036e3 (patch)
tree: af86d7e726f0d3ae5851b99852f057ed142387e6 /eglib
parent: 85f44228d3d99024b84af4152fead771e6c945b8 (diff)
2006-10-06 Miguel de Icaza <miguel@novell.com>
* src/gunicode.c (g_convert): Add bytes_read, bytes_written support; Small fixes to avoid valgrind errors.
(g_utf8_validate): Add, based on the libxml code.
gstr.c: implement g_strdown.
svn path=/trunk/mono/; revision=66375
Diffstat (limited to 'eglib')
-rw-r--r-- eglib/ChangeLog 9
-rw-r--r-- eglib/TODO 7
-rw-r--r-- eglib/src/glib.h 3
-rw-r--r-- eglib/src/gstr.c 10
-rw-r--r-- eglib/src/gunicode.c 112
-rw-r--r-- eglib/test/utf8.c 16
6 files changed, 138 insertions, 19 deletions
diff --git a/eglib/ChangeLog b/eglib/ChangeLog
index b997469066f..49db413b28b 100644
--- a/eglib/ChangeLog
+++ b/eglib/ChangeLog
@@ -1,3 +1,12 @@
+2006-10-06 Miguel de Icaza <miguel@novell.com>
+
+ * src/gunicode.c (g_convert): Add bytes_read, bytes_written
+ support; Small fixes to avoid valgrind errors.
+
+ (g_utf8_validate): Add, based on the libxml code.
+
+ gstr.c: implement g_strdown.
+
2006-10-05 Miguel de Icaza <miguel@novell.com>
* src/gunicode.c: Some work in progress to implement g_convert
diff --git a/eglib/TODO b/eglib/TODO
index b517ec0237a..18d5f2fffb2 100644
--- a/eglib/TODO
+++ b/eglib/TODO
@@ -25,16 +25,10 @@ Important Groups:
* String manipulation
1 g_filename_from_utf8 [LIMITATION: UTF8 only today]
1 g_ascii_strdown
- 4 g_strdown
10 g_strlcpy
2 g_strdelimit
1 g_strescape
- * Character encoding
- 3 g_utf8_validate
- 3 g_locale_to_utf8
- 1 g_locale_from_utf8
-
* Miscelaneous
3 g_spaced_primes_closest
2 g_printerr
@@ -56,7 +50,6 @@ Macros:
* Unimplemented, not supported currently:
- 3 g_convert (used for MONO_EXTERNAL_ENCODINGS)
g_unichar_tolower Used for deprecated unmanaged string collation
g_unichar_type Used for deprecated unmanaged string collation
diff --git a/eglib/src/glib.h b/eglib/src/glib.h
index a6098037ba7..f124309c967 100644
--- a/eglib/src/glib.h
+++ b/eglib/src/glib.h
@@ -651,11 +651,14 @@ gboolean g_get_charset (char **charset);
gchar *g_locale_to_utf8 (const gchar *opsysstring, gssize len,
gsize *bytes_read, gsize *bytes_written,
GError **error);
+gchar *g_locale_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read,
+ gsize *bytes_written, GError **error);
gchar *g_filename_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read,
gsize *bytes_written, GError **error);
gchar *g_convert (const gchar *str, gssize len,
const gchar *to_codeset, const gchar *from_codeset,
gsize *bytes_read, gsize *bytes_written, GError **error);
+gboolean g_utf8_validate (const gchar *str, gssize max_len, const gchar **end);
#endif
diff --git a/eglib/src/gstr.c b/eglib/src/gstr.c
index ecb30611252..f837972f70b 100644
--- a/eglib/src/gstr.c
+++ b/eglib/src/gstr.c
@@ -475,3 +475,13 @@ g_filename_from_uri (const gchar *uri, gchar **hostname, GError **error)
}
return result;
}
+
+/**
+ * g_strdown:
+ * @string: NUL-terminated string to convert in place; must not be NULL.
+ *
+ * Converts every character of @string to lower case using tolower().
+ * The string is modified in place; nothing is returned.
+ **/
+void
+g_strdown (gchar *string)
+{
+	g_return_if_fail (string != NULL);
+
+	/*
+	 * The cursor must advance after each character: without the
+	 * increment the loop never terminates on a non-empty string.
+	 */
+	for (; *string; string++) {
+		/* tolower() is only defined for values representable as unsigned char. */
+		*string = (gchar) tolower ((unsigned char) *string);
+	}
+}
diff --git a/eglib/src/gunicode.c b/eglib/src/gunicode.c
index fd7603815f0..3833ba738e5 100644
--- a/eglib/src/gunicode.c
+++ b/eglib/src/gunicode.c
@@ -6,6 +6,13 @@
*
* (C) 2006 Novell, Inc.
*
+ * utf8 validation code came from:
+ * libxml2-2.6.26 licensed under the MIT X11 license
+ *
+ * Authors credit in libxml's string.c:
+ * William Brack <wbrack@mmm.com.hk>
+ * daniel@veillard.com
+ *
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
@@ -24,6 +31,7 @@
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
*/
#include <stdio.h>
#include <glib.h>
@@ -55,6 +63,7 @@ g_convert (const gchar *str, gssize len,
{
iconv_t convertor;
char *buffer, *result, *output;
+ const char *strptr = (const char *) str;
int str_len = len == -1 ? strlen (str) : len;
int buffer_size;
size_t left, out_left;
@@ -66,19 +75,21 @@ g_convert (const gchar *str, gssize len,
return NULL;
}
- buffer_size = out_left = str_len + 1;
- buffer = g_malloc (out_left);
+ buffer_size = str_len + 1 + 8;
+ buffer = g_malloc (buffer_size);
+ out_left = str_len;
output = buffer;
left = str_len;
while (left > 0){
- int res = iconv (convertor, (char **) &str, &left, &output, &out_left);
+ int res = iconv (convertor, (char **) &strptr, &left, &output, &out_left);
if (res == (size_t) -1){
- int out_size = buffer_size - out_left;
-
if (errno == E2BIG){
char *n;
-
- buffer_size += left + 2;
+ int extra_space = 8 + left;
+ int output_used = output - buffer;
+
+ buffer_size += extra_space;
+
n = g_realloc (buffer, buffer_size);
if (n == NULL){
@@ -89,8 +100,8 @@ g_convert (const gchar *str, gssize len,
goto leave;
}
buffer = n;
- output = buffer + out_size;
- out_left = buffer_size - out_size;
+ out_left += extra_space;
+ output = buffer + output_used;
} else if (errno == EILSEQ){
if (error != NULL)
*error = g_error_new (NULL, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid multi-byte sequence on input");
@@ -106,6 +117,10 @@ g_convert (const gchar *str, gssize len,
}
}
}
+ if (bytes_read != NULL)
+ *bytes_read = strptr - str;
+ if (bytes_written != NULL)
+ *bytes_written = output - buffer;
*output = 0;
result = buffer;
leave:
@@ -151,3 +166,82 @@ g_locale_to_utf8 (const gchar *opsysstring, gssize len, gsize *bytes_read, gsize
return g_convert (opsysstring, len, "UTF-8", my_charset, bytes_read, bytes_written, error);
}
+/*
+ * g_locale_from_utf8:
+ *
+ * Converts @utf8string from "UTF-8" to the charset named by my_charset by
+ * delegating to g_convert(). @len, @bytes_read, @bytes_written and @error
+ * are forwarded to g_convert() unchanged, and its result is returned as is.
+ */
+gchar *
+g_locale_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read, gsize *bytes_written, GError **error)
+{
+	gchar *converted;
+
+	/* NOTE(review): presumably called so that my_charset is populated before use -- confirm against g_get_charset. */
+	g_get_charset (NULL);
+
+	converted = g_convert (utf8string, len, my_charset, "UTF-8", bytes_read, bytes_written, error);
+	return converted;
+}
+/**
+ * g_utf8_validate:
+ * @utf: Pointer to putative UTF-8 encoded data; must not be NULL.
+ * @max_len: Number of bytes of @utf to examine, or a negative value to
+ *   examine everything up to the terminating NUL.
+ * @end: If not NULL, receives the position where validation stopped: the
+ *   first byte of the offending sequence on failure, or the end of the
+ *   examined data on success.
+ *
+ * Checks @utf for being structurally valid UTF-8. This function is not
+ * super-strict, as it will allow longer UTF-8 sequences than necessary.
+ * Note that Java is capable of producing these sequences if provoked.
+ * Also note, this routine checks for the 4-byte maximum size, but does
+ * not check for 0x10ffff maximum value. When an explicit @max_len is
+ * given, NUL bytes inside that range are accepted as 1-byte codes.
+ *
+ * Return value: TRUE if @utf is valid, FALSE otherwise.
+ **/
+gboolean
+g_utf8_validate (const gchar *utf, gssize max_len, const gchar **end)
+{
+	gssize ix = 0;	/* same width as max_len so large buffers do not truncate */
+	gboolean valid = TRUE;
+
+	g_return_val_if_fail (utf != NULL, FALSE);
+
+	if (max_len < 0)
+		max_len = strlen (utf);
+
+	/*
+	 * utf is a string of 1, 2, 3 or 4 byte sequences. The valid
+	 * sequences are as follows (in "bit format"):
+	 *    0xxxxxxx                                      valid 1-byte
+	 *    110xxxxx 10xxxxxx                             valid 2-byte
+	 *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
+	 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
+	 */
+	while (ix < max_len) {
+		unsigned char c = utf [ix];
+		gssize seq_len, k;
+
+		if ((c & 0x80) == 0x00)		/* 1-byte code, starts with 0 */
+			seq_len = 1;
+		else if ((c & 0xe0) == 0xc0)	/* 2-byte code, starts with 110 */
+			seq_len = 2;
+		else if ((c & 0xf0) == 0xe0)	/* 3-byte code, starts with 1110 */
+			seq_len = 3;
+		else if ((c & 0xf8) == 0xf0)	/* 4-byte code, starts with 11110 */
+			seq_len = 4;
+		else				/* unknown encoding */
+			seq_len = 0;
+
+		/* Reject unknown lead bytes and sequences cut short by max_len. */
+		if (seq_len == 0 || seq_len > max_len - ix) {
+			valid = FALSE;
+			break;
+		}
+
+		/* Every trailing byte must look like 10xxxxxx. */
+		for (k = 1; k < seq_len; k++) {
+			if ((utf [ix + k] & 0xc0) != 0x80)
+				break;
+		}
+		if (k < seq_len) {
+			valid = FALSE;
+			break;
+		}
+
+		ix += seq_len;
+	}
+
+	/* Report the stop position on success as well as on failure. */
+	if (end != NULL)
+		*end = utf + ix;
+
+	return valid;
+}
diff --git a/eglib/test/utf8.c b/eglib/test/utf8.c
index 01f5482e3b9..303f3620852 100644
--- a/eglib/test/utf8.c
+++ b/eglib/test/utf8.c
@@ -197,9 +197,19 @@ RESULT
test_convert ()
{
gsize n;
- char *s = g_convert ("\242", -1, "UTF-8", "ISO-8859-1", NULL, &n, NULL);
-
- printf ("Result: %s %d\n", s, strlen (s));
+ char *s = g_convert ("\242\241\243\242\241\243\242\241\243\242\241\243", -1, "UTF-8", "ISO-8859-1", NULL, &n, NULL);
+ guchar *u = (guchar *) s;
+
+ if (strlen (s) != 24)
+ return FAILED ("Expected 24 bytes, got: %d", strlen (s));
+
+ if (u [1] != 162 || u [2] != 194 ||
+ u [3] != 161 || u [4] != 194 ||
+ u [5] != 163 || u [6] != 194)
+ return FAILED ("Incorrect conversion");
+
+ g_free (s);
+
return OK;
}