Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'source/blender/blenlib/intern/string_utf8.c')
-rw-r--r-- source/blender/blenlib/intern/string_utf8.c 163
1 files changed, 93 insertions, 70 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 96033615cf5..229a97a2fa7 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -47,6 +47,19 @@
// #define DEBUG_STRSIZE
+/* array copied from glib's gutf8.c, */
+/* Note: the last two values (0xfe and 0xff) are forbidden in utf-8, so they are treated as 1 byte long too. */
+static const size_t utf8_skip_data[256] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
+};
+
/* from libswish3, originally called u8_isvalid(),
* modified to return the index of the bad character (byte index not utf).
* http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
@@ -56,73 +69,91 @@
* length is in bytes, since without knowing whether the string is valid
* it's hard to know how many characters there are! */
-static const char trailingBytesForUTF8[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
-};
-
-int BLI_utf8_invalid_byte(const char *str, int length)
+/**
+ * Find first utf-8 invalid byte in given \a str, of \a length bytes.
+ *
+ * \return the offset of the first invalid byte.
+ */
+ptrdiff_t BLI_utf8_invalid_byte(const char *str, size_t length)
{
- const unsigned char *p, *pend = (const unsigned char *)str + length;
+ const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
unsigned char c;
int ab;
- for (p = (const unsigned char *)str; p < pend; p++) {
+ for (p = (const unsigned char *)str; p < pend; p++, length--) {
c = *p;
+ perr = p; /* Erroneous char is always the first of an invalid utf8 sequence... */
+ if (ELEM(c, 0xfe, 0xff, 0x00)) /* Those three values are not allowed in utf8 string. */
+ goto utf8_error;
if (c < 128)
continue;
if ((c & 0xc0) != 0xc0)
goto utf8_error;
- ab = trailingBytesForUTF8[c];
- if (length < ab)
+
+ /* Note that since we always increase p (and decrease length) by one byte in main loop, we only add/subtract
+ * extra utf8 bytes in code below
+ * (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */
+ ab = (int)utf8_skip_data[c] - 1;
+ if (length <= ab) {
goto utf8_error;
- length -= ab;
+ }
- p++;
/* Check top bits in the second byte */
+ p++;
+ length--;
if ((*p & 0xc0) != 0x80)
goto utf8_error;
/* Check for overlong sequences for each different length */
switch (ab) {
- /* Check for xx00 000x */
- case 1:
- if ((c & 0x3e) == 0) goto utf8_error;
- continue; /* We know there aren't any more bytes to check */
-
- /* Check for 1110 0000, xx0x xxxx */
- case 2:
- if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
- break;
-
- /* Check for 1111 0000, xx00 xxxx */
- case 3:
- if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
- break;
-
- /* Check for 1111 1000, xx00 0xxx */
- case 4:
- if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
- break;
-
- /* Check for leading 0xfe or 0xff,
- * and then for 1111 1100, xx00 00xx */
- case 5:
- if (c == 0xfe || c == 0xff ||
- (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error;
- break;
+ case 1:
+ /* Check for xx00 000x */
+ if ((c & 0x3e) == 0) goto utf8_error;
+ continue; /* We know there aren't any more bytes to check */
+
+ case 2:
+ /* Check for 1110 0000, xx0x xxxx */
+ if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
+ /* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
+ * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
+ /* From section 5.1 (and 5.2) */
+ if (c == 0xed) {
+ if (*p == 0xa0 && *(p + 1) == 0x80) goto utf8_error;
+ if (*p == 0xad && *(p + 1) == 0xbf) goto utf8_error;
+ if (*p == 0xae && *(p + 1) == 0x80) goto utf8_error;
+ if (*p == 0xaf && *(p + 1) == 0xbf) goto utf8_error;
+ if (*p == 0xb0 && *(p + 1) == 0x80) goto utf8_error;
+ if (*p == 0xbe && *(p + 1) == 0x80) goto utf8_error;
+ if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+ }
+ /* From section 5.3 */
+ if (c == 0xef) {
+ if (*p == 0xbf && *(p + 1) == 0xbe) goto utf8_error;
+ if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error;
+ }
+ break;
+
+ case 3:
+ /* Check for 1111 0000, xx00 xxxx */
+ if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
+ break;
+
+ case 4:
+ /* Check for 1111 1000, xx00 0xxx */
+ if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
+ break;
+
+ case 5:
+ /* Check for 1111 1100, xx00 00xx */
+ if (c == 0xfc && (*p & 0x3c) == 0) goto utf8_error;
+ break;
}
/* Check for valid bytes after the 2nd, if any; all must start 10 */
while (--ab > 0) {
- if ((*(p + 1) & 0xc0) != 0x80) goto utf8_error;
- p++; /* do this after so we get usable offset - campbell */
+ p++;
+ length--;
+ if ((*p & 0xc0) != 0x80) goto utf8_error;
}
}
@@ -130,18 +161,24 @@ int BLI_utf8_invalid_byte(const char *str, int length)
utf8_error:
- return (int)((const char *)p - (const char *)str) - 1;
+ return ((const char *)perr - (const char *)str);
}
-int BLI_utf8_invalid_strip(char *str, int length)
+/**
+ * Remove any invalid utf-8 bytes (taking multi-byte sequences into account, of course).
+ *
+ * \return number of stripped bytes.
+ */
+int BLI_utf8_invalid_strip(char *str, size_t length)
{
- int bad_char, tot = 0;
+ ptrdiff_t bad_char;
+ int tot = 0;
BLI_assert(str[length] == '\0');
while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
str += bad_char;
- length -= bad_char;
+ length -= (size_t)(bad_char + 1);
if (length == 0) {
/* last character bad, strip it */
@@ -151,7 +188,7 @@ int BLI_utf8_invalid_strip(char *str, int length)
}
else {
/* strip, keep looking */
- memmove(str, str + 1, (size_t)length);
+ memmove(str, str + 1, length + 1); /* +1 for NULL char! */
tot++;
}
}
@@ -162,31 +199,17 @@ int BLI_utf8_invalid_strip(char *str, int length)
/* compatible with BLI_strncpy, but ensure no partial utf8 chars */
-/* array copied from glib's gutf8.c,
- * note: this looks to be at odd's with 'trailingBytesForUTF8',
- * need to find out what gives here! - campbell */
-static const size_t utf8_skip_data[256] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
-};
-
#define BLI_STR_UTF8_CPY(dst, src, maxncpy) \
{ \
size_t utf8_size; \
while (*src != '\0' && (utf8_size = utf8_skip_data[*src]) < maxncpy) {\
maxncpy -= utf8_size; \
switch (utf8_size) { \
- case 6: *dst ++ = *src ++; \
- case 5: *dst ++ = *src ++; \
- case 4: *dst ++ = *src ++; \
- case 3: *dst ++ = *src ++; \
- case 2: *dst ++ = *src ++; \
+ case 6: *dst ++ = *src ++; ATTR_FALLTHROUGH; \
+ case 5: *dst ++ = *src ++; ATTR_FALLTHROUGH; \
+ case 4: *dst ++ = *src ++; ATTR_FALLTHROUGH; \
+ case 3: *dst ++ = *src ++; ATTR_FALLTHROUGH; \
+ case 2: *dst ++ = *src ++; ATTR_FALLTHROUGH; \
case 1: *dst ++ = *src ++; \
} \
} \