From be906f44c6bb51eb492ecb90dbc1e8e0bc01d1ec Mon Sep 17 00:00:00 2001
From: Campbell Barton <ideasman42@gmail.com>
Date: Wed, 25 Aug 2021 15:18:57 +1000
Subject: BLI_string_utf8: simplify utf8 stepping logic

There were multiple utf8 functions which treated
errors slightly differently.

Split BLI_str_utf8_as_unicode_step into two functions.

- BLI_str_utf8_as_unicode_step_or_error returns an error value
  when decoding fails and doesn't step.

- BLI_str_utf8_as_unicode_step always steps forward at least one byte,
  returning the byte value without decoding when decoding fails
  (needed to display some latin1 file-paths).

Font drawing uses BLI_str_utf8_as_unicode_step and no longer
checks for error values.
---
 source/blender/blenfont/intern/blf_font.c   |  9 ++--
 source/blender/blenlib/BLI_string_utf8.h    |  3 ++
 source/blender/blenlib/intern/string_utf8.c | 75 +++++++++++++----------------
 3 files changed, 41 insertions(+), 46 deletions(-)

diff --git a/source/blender/blenfont/intern/blf_font.c b/source/blender/blenfont/intern/blf_font.c
index 5ad48aa08d4..426008c9395 100644
--- a/source/blender/blenfont/intern/blf_font.c
+++ b/source/blender/blenfont/intern/blf_font.c
@@ -309,15 +309,13 @@ BLI_INLINE GlyphBLF *blf_utf8_next_fast(
     }
     (*i_p)++;
   }
-  else if ((*r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p)) != BLI_UTF8_ERR) {
+  else {
+    *r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p);
     g = blf_glyph_search(gc, *r_c);
     if (UNLIKELY(g == NULL)) {
       g = blf_glyph_add(font, gc, FT_Get_Char_Index(font->face, *r_c), *r_c);
     }
   }
-  else {
-    g = NULL;
-  }
   return g;
 }
 
@@ -1202,7 +1200,8 @@ int blf_font_count_missing_chars(FontBLF *font,
     if ((c = str[i]) < GLYPH_ASCII_TABLE_SIZE) {
       i++;
     }
-    else if ((c = BLI_str_utf8_as_unicode_step(str, str_len, &i)) != BLI_UTF8_ERR) {
+    else {
+      c = BLI_str_utf8_as_unicode_step(str, str_len, &i);
       if (FT_Get_Char_Index((font)->face, c) == 0) {
         missing++;
       }
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index b936e39731d..1b12147fe0f 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -46,6 +46,9 @@ unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
 unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
                                           size_t p_len,
                                           size_t *__restrict index) ATTR_NONNULL(1, 3);
+unsigned int BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p,
+                                                   size_t p_len,
+                                                   size_t *__restrict index) ATTR_NONNULL(1, 3);
 
 size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf);
 size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index dbde5221d7e..06fd3168c24 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -581,73 +581,66 @@ uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__r
 }
 
 /**
- * Another variant that steps over the index.
+ * UTF8 decoding that steps over the index (unless an error is encountered).
  *
  * \param p: The text to step over.
  * \param p_len: The length of `p`.
  * \param index: Index of `p` to step over.
- *
- * \note currently this also falls back to latin1 for text drawing.
+ * \return the code-point or #BLI_UTF8_ERR if there is a decoding error.
  *
  * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
  * must have the same behavior is encountering a nil byte,
  * so functions that only use the first part of a string has matching behavior to functions
  * that null terminate the text.
  */
-uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
-                                  const size_t p_len,
-                                  size_t *__restrict index)
+uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p,
+                                           const size_t p_len,
+                                           size_t *__restrict index)
 {
   int i, len;
   uint mask = 0;
   uint result;
-  const char c = p[*index];
+  const unsigned char c = (unsigned char)*(p += *index);
 
   BLI_assert(*index < p_len);
   BLI_assert(c != '\0');
 
   UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
-    const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len);
-    /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate.
-     * so there is no chance this sets the index past the nil byte (assert this is the case). */
-    BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL));
-    len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index);
-    result = BLI_UTF8_ERR;
-  }
-  else if (UNLIKELY(*index + (size_t)len > p_len)) {
-    /* A multi-byte character reads past the buffer bounds,
-     * match the behavior of encountering an byte with invalid encoding below. */
-    len = 1;
-    result = (uint)c;
+  if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) {
+    return BLI_UTF8_ERR;
   }
-  else {
-    /* This is tricky since there are a few ways we can bail out of bad unicode
-     * values, 3 possible solutions. */
-    p += *index;
-#if 0
-    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-#elif 1
-    /* WARNING: this is NOT part of glib, or supported by similar functions.
-     * this is added for text drawing because some filepaths can have latin1
-     * characters */
-    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-    if (result == BLI_UTF8_ERR) {
-      len = 1;
-      result = (uint)c;
-    }
-    /* end warning! */
-#else
-    /* Without a fallback like '?', text drawing will stop on this value. */
-    UTF8_GET(result, p, i, mask, len, '?');
-#endif
+  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
+  if (UNLIKELY(result == BLI_UTF8_ERR)) {
+    return BLI_UTF8_ERR;
   }
-
   *index += (size_t)len;
   BLI_assert(*index <= p_len);
   return result;
 }
 
+/**
+ * UTF8 decoding that always steps the index forward by at least one byte.
+ *
+ * \param p: The text to step over.
+ * \param p_len: The length of `p`.
+ * \param index: Index of `p` to step over.
+ * \return the code-point, or the byte at `p[*index]` if there is a decoding error.
+ *
+ * \note Falls back to `LATIN1` for text drawing, the byte is read as `unsigned char`.
+ */
+uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
+                                  const size_t p_len,
+                                  size_t *__restrict index)
+{
+  uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index);
+  if (UNLIKELY(result == BLI_UTF8_ERR)) {
+    result = (uint)(unsigned char)p[*index];
+    *index += 1;
+  }
+  BLI_assert(*index <= p_len);
+  return result;
+}
+
 /* was g_unichar_to_utf8 */
 /**
  * BLI_str_utf8_from_unicode:
-- 
cgit v1.2.3