diff options
Diffstat (limited to 'source/blender/blenlib/intern/string_utf8.c')
-rw-r--r-- | source/blender/blenlib/intern/string_utf8.c | 140 |
1 files changed, 54 insertions, 86 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c index dbde5221d7e..e35e2bcca3c 100644 --- a/source/blender/blenlib/intern/string_utf8.c +++ b/source/blender/blenlib/intern/string_utf8.c @@ -297,8 +297,8 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst, const size_t maxncpy) { const size_t maxlen = maxncpy - 1; - /* 6 is max utf8 length of an unicode char. */ - const int64_t maxlen_secured = (int64_t)maxlen - 6; + /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */ + const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX; size_t len = 0; BLI_assert(maxncpy != 0); @@ -314,9 +314,9 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst, /* We have to be more careful for the last six bytes, * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ while (*src) { - char t[6]; + char t[BLI_UTF8_MAX]; size_t l = BLI_str_utf8_from_unicode((uint)*src++, t); - BLI_assert(l <= 6); + BLI_assert(l <= BLI_UTF8_MAX); if (len + l > maxlen) { break; } @@ -546,104 +546,63 @@ uint BLI_str_utf8_as_unicode(const char *p) return result; } -/* variant that increments the length */ -uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index) +/** + * UTF8 decoding that steps over the index (unless an error is encountered). + * + * \param p: The text to step over. + * \param p_len: The length of `p`. + * \param index: Index of `p` to step over. + * \return the code-point or #BLI_UTF8_ERR if there is a decoding error. + * + * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes) + * must have the same behavior is encountering a nil byte, + * so functions that only use the first part of a string has matching behavior to functions + * that null terminate the text. + */ +uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p, + const size_t p_len, + size_t *__restrict index) { int i, len; uint mask = 0; uint result; - const unsigned char c = (unsigned char)*p; + const unsigned char c = (unsigned char)*(p += *index); + + BLI_assert(*index < p_len); + BLI_assert(c != '\0'); UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { + if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) { return BLI_UTF8_ERR; } UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); - *index += (size_t)len; - return result; -} - -uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index) -{ - int i, len; - uint mask = 0; - uint result; - const unsigned char c = (unsigned char)*p; - - UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { - *index += 1; - return c; + if (UNLIKELY(result == BLI_UTF8_ERR)) { + return BLI_UTF8_ERR; } - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); *index += (size_t)len; + BLI_assert(*index <= p_len); return result; } /** - * Another variant that steps over the index. + * UTF8 decoding that steps over the index (unless an error is encountered). * * \param p: The text to step over. * \param p_len: The length of `p`. * \param index: Index of `p` to step over. + * \return the code-point `(p + *index)` if there is a decoding error. * - * \note currently this also falls back to latin1 for text drawing. - * - * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes) - * must have the same behavior is encountering a nil byte, - * so functions that only use the first part of a string has matching behavior to functions - * that null terminate the text. + * \note Falls back to `LATIN1` for text drawing. */ uint BLI_str_utf8_as_unicode_step(const char *__restrict p, const size_t p_len, size_t *__restrict index) { - int i, len; - uint mask = 0; - uint result; - const char c = p[*index]; - - BLI_assert(*index < p_len); - BLI_assert(c != '\0'); - - UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { - const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len); - /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate. - * so there is no chance this sets the index past the nil byte (assert this is the case). */ - BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL)); - len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index); - result = BLI_UTF8_ERR; - } - else if (UNLIKELY(*index + (size_t)len > p_len)) { - /* A multi-byte character reads past the buffer bounds, - * match the behavior of encountering an byte with invalid encoding below. */ - len = 1; - result = (uint)c; - } - else { - /* This is tricky since there are a few ways we can bail out of bad unicode - * values, 3 possible solutions. */ - p += *index; -#if 0 - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); -#elif 1 - /* WARNING: this is NOT part of glib, or supported by similar functions. - * this is added for text drawing because some filepaths can have latin1 - * characters */ - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); - if (result == BLI_UTF8_ERR) { - len = 1; - result = (uint)c; - } - /* end warning! */ -#else - /* Without a fallback like '?', text drawing will stop on this value. */ - UTF8_GET(result, p, i, mask, len, '?'); -#endif + uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index); + if (UNLIKELY(result == BLI_UTF8_ERR)) { + result = (uint)p[*index]; + *index += 1; } - - *index += (size_t)len; BLI_assert(*index <= p_len); return result; } @@ -716,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w, memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy); #endif - while (*src_c && len != maxlen) { - size_t step = 0; - uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step); + const size_t src_c_len = strlen(src_c); + const char *src_c_end = src_c + src_c_len; + size_t index = 0; + while ((index < src_c_len) && (len != maxlen)) { + const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index); if (unicode != BLI_UTF8_ERR) { *dst_w = unicode; - src_c += step; } else { *dst_w = '?'; - src_c = BLI_str_find_next_char_utf8(src_c, NULL); + const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end); + if (src_c_next != NULL) { + index = (size_t)(src_c_next - src_c); + } + else { + index += 1; + } } dst_w++; len++; @@ -741,8 +707,8 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst, const size_t maxncpy) { const size_t maxlen = maxncpy - 1; - /* 6 is max utf8 length of an unicode char. */ - const int64_t maxlen_secured = (int64_t)maxlen - 6; + /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */ + const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX; size_t len = 0; BLI_assert(maxncpy != 0); @@ -758,9 +724,9 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst, /* We have to be more careful for the last six bytes, * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ while (*src) { - char t[6]; + char t[BLI_UTF8_MAX]; size_t l = BLI_str_utf8_from_unicode((uint)*src++, t); - BLI_assert(l <= 6); + BLI_assert(l <= BLI_UTF8_MAX); if (len + l > maxlen) { break; } @@ -905,7 +871,9 @@ size_t BLI_str_partition_ex_utf8(const char *str, index = 0; *sep >= str && (!end || *sep < end) && **sep != '\0'; *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) { - const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index); + size_t index_ofs = 0; + const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs); + index += index_ofs; if (c == BLI_UTF8_ERR) { *suf = *sep = NULL; |