python/utf8 compatibility fixes. (as discussed on the mailing list)

- user input has non-UTF-8 chars stripped from all text input other than file paths. - Python has the same limitation: it will raise an error on non-UTF-8 strings, except for paths, which use unicode escape literals so it's possible to deal with saving to these file paths from Python. - new string functions: BLI_utf8_invalid_byte(str, len) returns the index of the first invalid UTF-8 byte, or -1 on success. BLI_utf8_invalid_strip(str, len) strips non-UTF-8 chars.
author: Campbell Barton <ideasman42@gmail.com> 2010-08-28 16:34:22 +0400
committer: Campbell Barton <ideasman42@gmail.com> 2010-08-28 16:34:22 +0400
commit: f28b5e672ed05bb08b317c78a36b31e7c49d4f4e (patch)
tree: e326ad5f429f01c63878989d4ca393aa9bc65051 /source/blender/blenlib/intern/string.c
parent: 7a7076c878f7f5f5a28653a9a0df76065948ea7a (diff)
1 files changed, 111 insertions, 0 deletions
diff --git a/source/blender/blenlib/intern/string.c b/source/blender/blenlib/intern/string.c
index c344d8c0711..76193ba9a13 100644
--- a/source/blender/blenlib/intern/string.c
+++ b/source/blender/blenlib/intern/string.c
@@ -348,3 +348,114 @@ size_t BLI_strnlen(const char *str, size_t maxlen)
 	const char *end = memchr(str, '\0', maxlen);
 	return end ? (size_t) (end - str) : maxlen;
 }
+
+/* from libswish3, originally called u8_isvalid(),
+ * modified to return the index of the bad character (byte index not utf).
+ * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
+
+/* based on the valid_utf8 routine from the PCRE library by Philip Hazel
+
+   length is in bytes, since without knowing whether the string is valid
+   it's hard to know how many characters there are! */
+
/* Number of bytes that must follow a given lead byte, indexed by the lead
 * byte's value. Values below 0xC0 (ASCII and continuation bytes) map to 0;
 * 0xC0-0xDF map to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3, 0xF8-0xFB to 4 and
 * 0xFC-0xFF to 5. */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/**
 * Find the first invalid UTF-8 sequence in a byte buffer.
 *
 * Accepts the same encodings as before: ASCII plus 2- to 6-byte sequences
 * that are not overlong; lead bytes 0xFE and 0xFF are always rejected.
 *
 * \param str: bytes to check; only str[0] .. str[length - 1] are read.
 * \param length: number of bytes to check.
 * \return the byte index (not the character index) of the first byte of the
 * first invalid sequence, or -1 when all \a length bytes are valid.
 */
int BLI_utf8_invalid_byte(const char *str, int length)
{
	const unsigned char *p = (const unsigned char *)str;
	const unsigned char *pend = p + length;
	const unsigned char *perr = p;
	unsigned char c;
	int ab;

	for (; p < pend; p++) {
		c = *p;
		/* Every error below is reported at the start of the sequence being
		 * examined, no matter how far 'p' has advanced into it. */
		perr = p;

		if (c < 128) {
			continue;
		}
		/* A continuation byte (10xxxxxx) cannot start a sequence. */
		if ((c & 0xc0) != 0xc0) {
			goto utf8_error;
		}

		ab = trailingBytesForUTF8[c];
		/* The lead byte needs 'ab' more bytes inside the buffer; checking
		 * against 'pend' keeps every read below within str[0..length-1]. */
		if (pend - p <= ab) {
			goto utf8_error;
		}

		/* Check top bits in the second byte */
		p++;
		if ((*p & 0xc0) != 0x80) {
			goto utf8_error;
		}

		/* Check for overlong sequences for each different length */
		switch (ab) {
		/* Check for xx00 000x */
		case 1:
			if ((c & 0x3e) == 0) {
				goto utf8_error;
			}
			continue; /* We know there aren't any more bytes to check */

		/* Check for 1110 0000, xx0x xxxx */
		case 2:
			if (c == 0xe0 && (*p & 0x20) == 0) {
				goto utf8_error;
			}
			break;

		/* Check for 1111 0000, xx00 xxxx */
		case 3:
			if (c == 0xf0 && (*p & 0x30) == 0) {
				goto utf8_error;
			}
			break;

		/* Check for 1111 1000, xx00 0xxx */
		case 4:
			if (c == 0xf8 && (*p & 0x38) == 0) {
				goto utf8_error;
			}
			break;

		/* Check for leading 0xfe or 0xff,
		 * and then for 1111 1100, xx00 00xx */
		case 5:
			if (c == 0xfe || c == 0xff ||
			    (c == 0xfc && (*p & 0x3c) == 0))
			{
				goto utf8_error;
			}
			break;

		default:
			break;
		}

		/* Check for valid bytes after the 2nd, if any; all must start 10 */
		while (--ab > 0) {
			p++;
			if ((*p & 0xc0) != 0x80) {
				goto utf8_error;
			}
		}
	}

	return -1;

utf8_error:

	return (int)(perr - (const unsigned char *)str);
}
+
/**
 * Remove, in place, every byte that BLI_utf8_invalid_byte() reports as
 * invalid, one byte per pass, until the remaining bytes validate.
 *
 * \param str: buffer to clean. The byte at str[length] is moved along with
 * the tail on each removal, so it is expected to be the string's '\0'
 * terminator.
 * \param length: number of bytes in \a str, not counting the terminator.
 * \return the number of bytes removed.
 */
int BLI_utf8_invalid_strip(char *str, int length)
{
	int bad_char, tot = 0;

	while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
		/* Skip the validated prefix; 'length' becomes the number of bytes
		 * that follow the bad byte, so later passes never rescan bytes
		 * beyond the shortened string. */
		str += bad_char;
		length -= bad_char + 1;

		if (length == 0) {
			/* last character bad, strip it */
			*str = '\0';
			tot++;
			break;
		}

		/* strip, keep looking; +1 so the terminator moves with the tail */
		memmove(str, str + 1, (size_t)length + 1);
		tot++;
	}

	return tot;
}
+
author	Campbell Barton <ideasman42@gmail.com>	2010-08-28 16:34:22 +0400
committer	Campbell Barton <ideasman42@gmail.com>	2010-08-28 16:34:22 +0400
commit	f28b5e672ed05bb08b317c78a36b31e7c49d4f4e (patch)
tree	e326ad5f429f01c63878989d4ca393aa9bc65051 /source/blender/blenlib/intern/string.c
parent	7a7076c878f7f5f5a28653a9a0df76065948ea7a (diff)