1 files changed, 54 insertions, 86 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index dbde5221d7e..e35e2bcca3c 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -297,8 +297,8 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
                                  const size_t maxncpy)
 {
   const size_t maxlen = maxncpy - 1;
-  /* 6 is max utf8 length of an unicode char. */
-  const int64_t maxlen_secured = (int64_t)maxlen - 6;
+  /* #BLI_UTF8_MAX is the max utf8 length of a unicode char. */
+  const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX;
   size_t len = 0;
 
   BLI_assert(maxncpy != 0);
@@ -314,9 +314,9 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
   /* We have to be more careful for the last six bytes,
    * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
   while (*src) {
-    char t[6];
+    char t[BLI_UTF8_MAX];
     size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
-    BLI_assert(l <= 6);
+    BLI_assert(l <= BLI_UTF8_MAX);
     if (len + l > maxlen) {
       break;
     }
@@ -546,104 +546,63 @@ uint BLI_str_utf8_as_unicode(const char *p)
   return result;
 }
 
-/* variant that increments the length */
-uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
+/**
+ * UTF8 decoding that steps over the index (unless an error is encountered).
+ *
+ * \param p: The text to step over.
+ * \param p_len: The length of `p`.
+ * \param index: Index of `p` to step over.
+ * \return the code-point or #BLI_UTF8_ERR if there is a decoding error.
+ *
+ * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
+ * must match the behavior of encountering a nil byte,
+ * so functions that only use the first part of a string have matching behavior to functions
+ * that null terminate the text.
+ */
+uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p,
+                                           const size_t p_len,
+                                           size_t *__restrict index)
 {
   int i, len;
   uint mask = 0;
   uint result;
-  const unsigned char c = (unsigned char)*p;
+  const unsigned char c = (unsigned char)*(p += *index);
+
+  BLI_assert(*index < p_len);
+  BLI_assert(c != '\0');
 
   UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
+  if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) {
     return BLI_UTF8_ERR;
   }
   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-  *index += (size_t)len;
-  return result;
-}
-
-uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
-{
-  int i, len;
-  uint mask = 0;
-  uint result;
-  const unsigned char c = (unsigned char)*p;
-
-  UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
-    *index += 1;
-    return c;
+  if (UNLIKELY(result == BLI_UTF8_ERR)) {
+    return BLI_UTF8_ERR;
   }
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
   *index += (size_t)len;
+  BLI_assert(*index <= p_len);
   return result;
 }
 
 /**
- * Another variant that steps over the index.
+ * UTF8 decoding that steps over the index (stepping a single byte if an error is encountered).
  *
  * \param p: The text to step over.
  * \param p_len: The length of `p`.
  * \param index: Index of `p` to step over.
+ * \return the code-point, or the byte at `p[*index]` if there is a decoding error.
  *
- * \note currently this also falls back to latin1 for text drawing.
- *
- * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
- * must have the same behavior is encountering a nil byte,
- * so functions that only use the first part of a string has matching behavior to functions
- * that null terminate the text.
+ * \note Falls back to `LATIN1` for text drawing.
  */
 uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
                                   const size_t p_len,
                                   size_t *__restrict index)
 {
-  int i, len;
-  uint mask = 0;
-  uint result;
-  const char c = p[*index];
-
-  BLI_assert(*index < p_len);
-  BLI_assert(c != '\0');
-
-  UTF8_COMPUTE(c, mask, len, -1);
-  if (UNLIKELY(len == -1)) {
-    const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len);
-    /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate.
-     * so there is no chance this sets the index past the nil byte (assert this is the case). */
-    BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL));
-    len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index);
-    result = BLI_UTF8_ERR;
-  }
-  else if (UNLIKELY(*index + (size_t)len > p_len)) {
-    /* A multi-byte character reads past the buffer bounds,
-     * match the behavior of encountering an byte with invalid encoding below. */
-    len = 1;
-    result = (uint)c;
-  }
-  else {
-    /* This is tricky since there are a few ways we can bail out of bad unicode
-     * values, 3 possible solutions. */
-    p += *index;
-#if 0
-    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-#elif 1
-    /* WARNING: this is NOT part of glib, or supported by similar functions.
-     * this is added for text drawing because some filepaths can have latin1
-     * characters */
-    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-    if (result == BLI_UTF8_ERR) {
-      len = 1;
-      result = (uint)c;
-    }
-    /* end warning! */
-#else
-    /* Without a fallback like '?', text drawing will stop on this value. */
-    UTF8_GET(result, p, i, mask, len, '?');
-#endif
+  uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index);
+  if (UNLIKELY(result == BLI_UTF8_ERR)) {
+    result = (uint)p[*index];
+    *index += 1;
   }
-
-  *index += (size_t)len;
   BLI_assert(*index <= p_len);
   return result;
 }
@@ -716,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
   memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
 #endif
 
-  while (*src_c && len != maxlen) {
-    size_t step = 0;
-    uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
+  const size_t src_c_len = strlen(src_c);
+  const char *src_c_end = src_c + src_c_len;
+  size_t index = 0;
+  while ((index < src_c_len) && (len != maxlen)) {
+    const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index);
     if (unicode != BLI_UTF8_ERR) {
       *dst_w = unicode;
-      src_c += step;
     }
     else {
       *dst_w = '?';
-      src_c = BLI_str_find_next_char_utf8(src_c, NULL);
+      const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
+      if (src_c_next != NULL) {
+        index = (size_t)(src_c_next - src_c);
+      }
+      else {
+        index += 1;
+      }
     }
     dst_w++;
     len++;
@@ -741,8 +707,8 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
                              const size_t maxncpy)
 {
   const size_t maxlen = maxncpy - 1;
-  /* 6 is max utf8 length of an unicode char. */
-  const int64_t maxlen_secured = (int64_t)maxlen - 6;
+  /* #BLI_UTF8_MAX is the max utf8 length of a unicode char. */
+  const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX;
   size_t len = 0;
 
   BLI_assert(maxncpy != 0);
@@ -758,9 +724,9 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
   /* We have to be more careful for the last six bytes,
    * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
   while (*src) {
-    char t[6];
+    char t[BLI_UTF8_MAX];
     size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
-    BLI_assert(l <= 6);
+    BLI_assert(l <= BLI_UTF8_MAX);
     if (len + l > maxlen) {
       break;
     }
@@ -905,7 +871,9 @@ size_t BLI_str_partition_ex_utf8(const char *str,
       index = 0;
        *sep >= str && (!end || *sep < end) && **sep != '\0';
        *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
-    const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index);
+    size_t index_ofs = 0;
+    const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
+    index += index_ofs;
 
     if (c == BLI_UTF8_ERR) {
       *suf = *sep = NULL;