diff options
Diffstat (limited to 'source/blender/blenlib/intern/string_utf8.c')
-rw-r--r-- | source/blender/blenlib/intern/string_utf8.c | 163 |
1 files changed, 93 insertions, 70 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c index 96033615cf5..229a97a2fa7 100644 --- a/source/blender/blenlib/intern/string_utf8.c +++ b/source/blender/blenlib/intern/string_utf8.c @@ -47,6 +47,19 @@ // #define DEBUG_STRSIZE +/* array copied from glib's gutf8.c, */ +/* Note: last two values (0xfe and 0xff) are forbidden in utf-8, so they are considered 1 byte length too. */ +static const size_t utf8_skip_data[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 +}; + /* from libswish3, originally called u8_isvalid(), * modified to return the index of the bad character (byte index not utf). * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */ @@ -56,73 +69,91 @@ * length is in bytes, since without knowing whether the string is valid * it's hard to know how many characters there are! */ -static const char trailingBytesForUTF8[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 -}; - -int BLI_utf8_invalid_byte(const char *str, int length) +/** + * Find first utf-8 invalid byte in given \a str, of \a length bytes. + * + * \return the offset of the first invalid byte. + */ +ptrdiff_t BLI_utf8_invalid_byte(const char *str, size_t length) { - const unsigned char *p, *pend = (const unsigned char *)str + length; + const unsigned char *p, *perr, *pend = (const unsigned char *)str + length; unsigned char c; int ab; - for (p = (const unsigned char *)str; p < pend; p++) { + for (p = (const unsigned char *)str; p < pend; p++, length--) { c = *p; + perr = p; /* Erroneous char is always the first of an invalid utf8 sequence... */ + if (ELEM(c, 0xfe, 0xff, 0x00)) /* Those three values are not allowed in utf8 string. */ + goto utf8_error; if (c < 128) continue; if ((c & 0xc0) != 0xc0) goto utf8_error; - ab = trailingBytesForUTF8[c]; - if (length < ab) + + /* Note that since we always increase p (and decrease length) by one byte in main loop, we only add/subtract + * extra utf8 bytes in code below + * (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */ + ab = (int)utf8_skip_data[c] - 1; + if (length <= ab) { goto utf8_error; - length -= ab; + } - p++; /* Check top bits in the second byte */ + p++; + length--; if ((*p & 0xc0) != 0x80) goto utf8_error; /* Check for overlong sequences for each different length */ switch (ab) { - /* Check for xx00 000x */ - case 1: - if ((c & 0x3e) == 0) goto utf8_error; - continue; /* We know there aren't any more bytes to check */ - - /* Check for 1110 0000, xx0x xxxx */ - case 2: - if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error; - break; - - /* Check for 1111 0000, xx00 xxxx */ - case 3: - if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error; - break; - - /* Check for 1111 1000, xx00 0xxx */ - case 4: - if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error; - break; - - /* Check for leading 0xfe or 0xff, - * and then for 1111 1100, xx00 00xx */ - case 5: - if (c == 0xfe || c == 0xff || - (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error; - break; + case 1: + /* Check for xx00 000x */ + if ((c & 0x3e) == 0) goto utf8_error; + continue; /* We know there aren't any more bytes to check */ + + case 2: + /* Check for 1110 0000, xx0x xxxx */ + if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error; + /* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn + * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */ + /* From section 5.1 (and 5.2) */ + if (c == 0xed) { + if (*p == 0xa0 && *(p + 1) == 0x80) goto utf8_error; + if (*p == 0xad && *(p + 1) == 0xbf) goto utf8_error; + if (*p == 0xae && *(p + 1) == 0x80) goto utf8_error; + if (*p == 0xaf && *(p + 1) == 0xbf) goto utf8_error; + if (*p == 0xb0 && *(p + 1) == 0x80) goto utf8_error; + if (*p == 0xbe && *(p + 1) == 0x80) goto utf8_error; + if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error; + } + /* From section 5.3 */ + if (c == 0xef) { + if (*p == 0xbf && *(p + 1) == 0xbe) goto utf8_error; + if (*p == 0xbf && *(p + 1) == 0xbf) goto utf8_error; + } + break; + + case 3: + /* Check for 1111 0000, xx00 xxxx */ + if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error; + break; + + case 4: + /* Check for 1111 1000, xx00 0xxx */ + if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error; + break; + + case 5: + /* Check for 1111 1100, xx00 00xx */ + if (c == 0xfc && (*p & 0x3c) == 0) goto utf8_error; + break; } /* Check for valid bytes after the 2nd, if any; all must start 10 */ while (--ab > 0) { - if ((*(p + 1) & 0xc0) != 0x80) goto utf8_error; - p++; /* do this after so we get usable offset - campbell */ + p++; + length--; + if ((*p & 0xc0) != 0x80) goto utf8_error; } } @@ -130,18 +161,24 @@ int BLI_utf8_invalid_byte(const char *str, int length) utf8_error: - return (int)((const char *)p - (const char *)str) - 1; + return ((const char *)perr - (const char *)str); } -int BLI_utf8_invalid_strip(char *str, int length) +/** + * Remove any invalid utf-8 byte (taking into account multi-bytes sequence of course). + * + * \return number of stripped bytes. + */ +int BLI_utf8_invalid_strip(char *str, size_t length) { - int bad_char, tot = 0; + ptrdiff_t bad_char; + int tot = 0; BLI_assert(str[length] == '\0'); while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) { str += bad_char; - length -= bad_char; + length -= (size_t)(bad_char + 1); if (length == 0) { /* last character bad, strip it */ @@ -151,7 +188,7 @@ int BLI_utf8_invalid_strip(char *str, int length) } else { /* strip, keep looking */ - memmove(str, str + 1, (size_t)length); + memmove(str, str + 1, length + 1); /* +1 for NULL char! */ tot++; } } @@ -162,31 +199,17 @@ int BLI_utf8_invalid_strip(char *str, int length) /* compatible with BLI_strncpy, but esnure no partial utf8 chars */ -/* array copied from glib's gutf8.c, - * note: this looks to be at odd's with 'trailingBytesForUTF8', - * need to find out what gives here! - campbell */ -static const size_t utf8_skip_data[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 -}; - #define BLI_STR_UTF8_CPY(dst, src, maxncpy) \ { \ size_t utf8_size; \ while (*src != '\0' && (utf8_size = utf8_skip_data[*src]) < maxncpy) {\ maxncpy -= utf8_size; \ switch (utf8_size) { \ - case 6: *dst ++ = *src ++; \ - case 5: *dst ++ = *src ++; \ - case 4: *dst ++ = *src ++; \ - case 3: *dst ++ = *src ++; \ - case 2: *dst ++ = *src ++; \ + case 6: *dst ++ = *src ++; ATTR_FALLTHROUGH; \ + case 5: *dst ++ = *src ++; ATTR_FALLTHROUGH; \ + case 4: *dst ++ = *src ++; ATTR_FALLTHROUGH; \ + case 3: *dst ++ = *src ++; ATTR_FALLTHROUGH; \ + case 2: *dst ++ = *src ++; ATTR_FALLTHROUGH; \ case 1: *dst ++ = *src ++; \ } \ } \ |