1 files changed, 93 insertions, 70 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 96033615cf5..229a97a2fa7 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -47,6 +47,19 @@
 
 // #define DEBUG_STRSIZE
 
+/* Total byte length of a utf-8 sequence, indexed by its first byte (array copied from glib's gutf8.c). */
+/* Note: last two values (0xfe and 0xff) are forbidden in utf-8, so they are considered 1 byte length too. */
+static const size_t utf8_skip_data[256] = {
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x00-0x1f */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x20-0x3f */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x40-0x5f */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x60-0x7f */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x80-0x9f (10xxxxxx) */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0xa0-0xbf (10xxxxxx) */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xc0-0xdf */
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1   /* 0xe0-0xff */
+};
+
 /* from libswish3, originally called u8_isvalid(),
  * modified to return the index of the bad character (byte index not utf).
  * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
@@ -56,73 +69,91 @@
  * length is in bytes, since without knowing whether the string is valid
  * it's hard to know how many characters there are! */
 
-static const char trailingBytesForUTF8[256] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
-};
-
-int BLI_utf8_invalid_byte(const char *str, int length)
+/**
+ * Find the first invalid utf-8 byte in the given \a str, which is \a length bytes long.
+ *
+ * \return the byte offset of the first byte of the first invalid sequence (BLI_utf8_invalid_strip treats -1 as "none").
+ */
+ptrdiff_t BLI_utf8_invalid_byte(const char *str, size_t length)
 {
-	const unsigned char *p, *pend = (const unsigned char *)str + length;
+	const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
 	unsigned char c;
 	int ab;
 
-	for (p = (const unsigned char *)str; p < pend; p++) {
+	for (p = (const unsigned char *)str; p < pend; p++, length--) {
 		c = *p;
+		perr = p;  /* Erroneous char is always the first of an invalid utf8 sequence... */
+		if (ELEM(c, 0xfe, 0xff, 0x00))  /* Those three values are not allowed in utf8 string. */
+			goto utf8_error;
 		if (c < 128)
 			continue;
 		if ((c & 0xc0) != 0xc0)
 			goto utf8_error;
-		ab = trailingBytesForUTF8[c];
-		if (length < ab)
+
+		/* The main loop already advances p (and decrements length) by one byte per iteration, so the code below
+		 * only accounts for the extra bytes of a multi-byte sequence:
+		 * 'ab' is the number of bytes remaining in the utf8 sequence after the initial one. */
+		ab = (int)utf8_skip_data[c] - 1;
+		if (length <= ab) {
 			goto utf8_error;
-		length -= ab;
+		}
 
-		p++;
 		/* Check top bits in the second byte */
+		p++;
+		length--;
 		if ((*p & 0xc0) != 0x80)
 			goto utf8_error;
 
 		/* Check for overlong sequences for each different length */
 		switch (ab) {
-			/* Check for xx00 000x */
-		case 1:
-			if ((c & 0x3e) == 0) goto utf8_error;
-			continue;   /* We know there aren't any more bytes to check */
-
-			/* Check for 1110 0000, xx0x xxxx */
-		case 2:
-			if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
-			break;
-
-			/* Check for 1111 0000, xx00 xxxx */
-		case 3:
-			if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
-			break;
-
-			/* Check for 1111 1000, xx00 0xxx */
-		case 4:
-			if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
-			break;
-
-			/* Check for leading 0xfe or 0xff,
-			 * and then for 1111 1100, xx00 00xx */
-		case 5:
-			if (c == 0xfe || c == 0xff ||
-			    (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error;
-			break;
+			case 1:
+				/* Check for xx00 000x */
+				if ((c & 0x3e) == 0) goto utf8_error;
+				continue;   /* We know there aren't any more bytes to check */
+
+			case 2:
+				/* Check for 1110 0000, xx0x xxxx */
+				if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
+				/* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
+				 * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
+				/* From section 5.1 (and 5.2): UTF-16 surrogates; only the exact sequences listed below are rejected. */
+				if (c == 0xed) {
+					if (*p == 0xa0 && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xad && *(p + 1) == 0xbf) goto utf8_error;
+					if (*p == 0xae && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xaf && *(p + 1) == 0xbf) goto utf8_error;
+					if (*p == 0xb0 && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xbe && *(p + 1) == 0x80) goto utf8_error;
+					if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+				}
+				/* From section 5.3: U+FFFE and U+FFFF. */
+				if (c == 0xef) {
+					if (*p == 0xbf && *(p + 1) == 0xbe) goto utf8_error;
+					if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+				}
+				break;
+
+			case 3:
+				/* Check for 1111 0000, xx00 xxxx */
+				if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
+				break;
+
+			case 4:
+				/* Check for 1111 1000, xx00 0xxx */
+				if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
+				break;
+
+			case 5:
+				/* Check for 1111 1100, xx00 00xx */
+				if (c == 0xfc && (*p & 0x3c) == 0) goto utf8_error;
+				break;
 		}
 
 		/* Check for valid bytes after the 2nd, if any; all must start 10 */
 		while (--ab > 0) {
-			if ((*(p + 1) & 0xc0) != 0x80) goto utf8_error;
-			p++; /* do this after so we get usable offset - campbell */
+			p++;
+			length--;
+			if ((*p & 0xc0) != 0x80) goto utf8_error;
 		}
 	}
 
@@ -130,18 +161,24 @@ int BLI_utf8_invalid_byte(const char *str, int length)
 
 utf8_error:
 
-	return (int)((const char *)p - (const char *)str) - 1;
+	return ((const char *)perr - (const char *)str);
 }
 
-int BLI_utf8_invalid_strip(char *str, int length)
+/**
+ * Remove any invalid utf-8 byte (taking multi-byte sequences into account).
+ *
+ * \return number of stripped bytes.
+ */
+int BLI_utf8_invalid_strip(char *str, size_t length)
 {
-	int bad_char, tot = 0;
+	ptrdiff_t bad_char;
+	int tot = 0;
 
 	BLI_assert(str[length] == '\0');
 
 	while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
 		str += bad_char;
-		length -= bad_char;
+		length -= (size_t)(bad_char + 1);
 
 		if (length == 0) {
 			/* last character bad, strip it */
@@ -151,7 +188,7 @@ int BLI_utf8_invalid_strip(char *str, int length)
 		}
 		else {
 			/* strip, keep looking */
-			memmove(str, str + 1, (size_t)length);
+			memmove(str, str + 1, length + 1);  /* +1 for NULL char! */
 			tot++;
 		}
 	}
@@ -162,31 +199,17 @@ int BLI_utf8_invalid_strip(char *str, int length)
 
 /* compatible with BLI_strncpy, but esnure no partial utf8 chars */
 
-/* array copied from glib's gutf8.c,
- * note: this looks to be at odd's with 'trailingBytesForUTF8',
- * need to find out what gives here! - campbell */
-static const size_t utf8_skip_data[256] = {
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
-};
-
 #define BLI_STR_UTF8_CPY(dst, src, maxncpy)                                   \
 	{                                                                         \
 		size_t utf8_size;                                                     \
 		while (*src != '\0' && (utf8_size = utf8_skip_data[*src]) < maxncpy) {\
 			maxncpy -= utf8_size;                                             \
 			switch (utf8_size) {                                              \
-				case 6: *dst ++ = *src ++;                                    \
-				case 5: *dst ++ = *src ++;                                    \
-				case 4: *dst ++ = *src ++;                                    \
-				case 3: *dst ++ = *src ++;                                    \
-				case 2: *dst ++ = *src ++;                                    \
+				case 6: *dst ++ = *src ++; ATTR_FALLTHROUGH;                  \
+				case 5: *dst ++ = *src ++; ATTR_FALLTHROUGH;                  \
+				case 4: *dst ++ = *src ++; ATTR_FALLTHROUGH;                  \
+				case 3: *dst ++ = *src ++; ATTR_FALLTHROUGH;                  \
+				case 2: *dst ++ = *src ++; ATTR_FALLTHROUGH;                  \
 				case 1: *dst ++ = *src ++;                                    \
 			}                                                                 \
 		}                                                                     \