From be906f44c6bb51eb492ecb90dbc1e8e0bc01d1ec Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Wed, 25 Aug 2021 15:18:57 +1000 Subject: BLI_string_utf8: simplify utf8 stepping logic There were multiple utf8 functions which treated errors slightly differently. Split BLI_str_utf8_as_unicode_step into two functions. - BLI_str_utf8_as_unicode_step_or_error returns an error value when decoding fails and doesn't step. - BLI_str_utf8_as_unicode_step always steps forward at least one byte, returning the byte value without decoding (needed to display some latin1 file-paths). Font drawing uses BLI_str_utf8_as_unicode_step and no longer checks for error values. --- source/blender/blenfont/intern/blf_font.c | 9 ++-- source/blender/blenlib/BLI_string_utf8.h | 3 ++ source/blender/blenlib/intern/string_utf8.c | 75 +++++++++++++---------------- 3 files changed, 41 insertions(+), 46 deletions(-) diff --git a/source/blender/blenfont/intern/blf_font.c b/source/blender/blenfont/intern/blf_font.c index 5ad48aa08d4..426008c9395 100644 --- a/source/blender/blenfont/intern/blf_font.c +++ b/source/blender/blenfont/intern/blf_font.c @@ -309,15 +309,13 @@ BLI_INLINE GlyphBLF *blf_utf8_next_fast( } (*i_p)++; } - else if ((*r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p)) != BLI_UTF8_ERR) { + else { + *r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p); g = blf_glyph_search(gc, *r_c); if (UNLIKELY(g == NULL)) { g = blf_glyph_add(font, gc, FT_Get_Char_Index(font->face, *r_c), *r_c); } } - else { - g = NULL; - } return g; } @@ -1202,7 +1200,8 @@ int blf_font_count_missing_chars(FontBLF *font, if ((c = str[i]) < GLYPH_ASCII_TABLE_SIZE) { i++; } - else if ((c = BLI_str_utf8_as_unicode_step(str, str_len, &i)) != BLI_UTF8_ERR) { + else { + c = BLI_str_utf8_as_unicode_step(str, str_len, &i); if (FT_Get_Char_Index((font)->face, c) == 0) { missing++; } diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h index b936e39731d..1b12147fe0f 100644 ---
a/source/blender/blenlib/BLI_string_utf8.h +++ b/source/blender/blenlib/BLI_string_utf8.h @@ -46,6 +46,9 @@ unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_NONNULL(1, 3); +unsigned int BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p, + size_t p_len, + size_t *__restrict index) ATTR_NONNULL(1, 3); size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf); size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w, diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c index dbde5221d7e..06fd3168c24 100644 --- a/source/blender/blenlib/intern/string_utf8.c +++ b/source/blender/blenlib/intern/string_utf8.c @@ -581,73 +581,66 @@ uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__r } /** - * Another variant that steps over the index. + * UTF8 decoding that steps over the index (unless an error is encountered). * * \param p: The text to step over. * \param p_len: The length of `p`. * \param index: Index of `p` to step over. - * - * \note currently this also falls back to latin1 for text drawing. + * \return the code-point or #BLI_UTF8_ERR if there is a decoding error. * * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes) * must have the same behavior is encountering a nil byte, * so functions that only use the first part of a string has matching behavior to functions * that null terminate the text. 
*/ -uint BLI_str_utf8_as_unicode_step(const char *__restrict p, - const size_t p_len, - size_t *__restrict index) +uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p, + const size_t p_len, + size_t *__restrict index) { int i, len; uint mask = 0; uint result; - const char c = p[*index]; + const unsigned char c = (unsigned char)*(p += *index); BLI_assert(*index < p_len); BLI_assert(c != '\0'); UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { - const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len); - /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate. - * so there is no chance this sets the index past the nil byte (assert this is the case). */ - BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL)); - len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index); - result = BLI_UTF8_ERR; - } - else if (UNLIKELY(*index + (size_t)len > p_len)) { - /* A multi-byte character reads past the buffer bounds, - * match the behavior of encountering an byte with invalid encoding below. */ - len = 1; - result = (uint)c; + if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) { + return BLI_UTF8_ERR; } - else { - /* This is tricky since there are a few ways we can bail out of bad unicode - * values, 3 possible solutions. */ - p += *index; -#if 0 - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); -#elif 1 - /* WARNING: this is NOT part of glib, or supported by similar functions. - * this is added for text drawing because some filepaths can have latin1 - * characters */ - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); - if (result == BLI_UTF8_ERR) { - len = 1; - result = (uint)c; - } - /* end warning! */ -#else - /* Without a fallback like '?', text drawing will stop on this value. 
*/ - UTF8_GET(result, p, i, mask, len, '?'); -#endif + UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); + if (UNLIKELY(result == BLI_UTF8_ERR)) { + return BLI_UTF8_ERR; } - *index += (size_t)len; BLI_assert(*index <= p_len); return result; } +/** + * UTF8 decoding that always steps over the index (by a single byte when decoding fails). + * + * \param p: The text to step over. + * \param p_len: The length of `p`. + * \param index: Index of `p` to step over. + * \return the code-point, or the byte value at `p[*index]` if there is a decoding error. + * + * \note Falls back to `LATIN1` for text drawing. + */ +uint BLI_str_utf8_as_unicode_step(const char *__restrict p, + const size_t p_len, + size_t *__restrict index) +{ + uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index); + if (UNLIKELY(result == BLI_UTF8_ERR)) { + result = (uint)p[*index]; + *index += 1; + } + BLI_assert(*index <= p_len); + return result; +} + /* was g_unichar_to_utf8 */ /** * BLI_str_utf8_from_unicode: -- cgit v1.2.3