6 files changed, 179 insertions, 46 deletions
diff --git a/source/blender/blenfont/intern/blf_font.c b/source/blender/blenfont/intern/blf_font.c
index 50b4bb09b7b..5ad48aa08d4 100644
--- a/source/blender/blenfont/intern/blf_font.c
+++ b/source/blender/blenfont/intern/blf_font.c
@@ -298,7 +298,7 @@ static void blf_batch_draw_end(void)
  */
 
 BLI_INLINE GlyphBLF *blf_utf8_next_fast(
-    FontBLF *font, GlyphCacheBLF *gc, const char *str, size_t *i_p, uint *r_c)
+    FontBLF *font, GlyphCacheBLF *gc, const char *str, size_t str_len, size_t *i_p, uint *r_c)
 {
   GlyphBLF *g;
   if ((*r_c = str[*i_p]) < GLYPH_ASCII_TABLE_SIZE) {
@@ -309,7 +309,7 @@ BLI_INLINE GlyphBLF *blf_utf8_next_fast(
     }
     (*i_p)++;
   }
-  else if ((*r_c = BLI_str_utf8_as_unicode_step(str, i_p)) != BLI_UTF8_ERR) {
+  else if ((*r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p)) != BLI_UTF8_ERR) {
     g = blf_glyph_search(gc, *r_c);
     if (UNLIKELY(g == NULL)) {
       g = blf_glyph_add(font, gc, FT_Get_Char_Index(font->face, *r_c), *r_c);
@@ -382,7 +382,7 @@ static void blf_font_draw_ex(FontBLF *font,
   blf_batch_draw_begin(font);
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -478,7 +478,7 @@ int blf_font_draw_mono(FontBLF *font, const char *str, const size_t str_len, int
   blf_batch_draw_begin(font);
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -535,7 +535,7 @@ static void blf_font_draw_buffer_ex(FontBLF *font,
   /* another buffer specific call for color conversion */
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -703,7 +703,7 @@ size_t blf_font_width_to_strlen(
 
   for (i_prev = i = 0, width_new = pen_x = 0, g_prev = NULL, c_prev = 0; (i < str_len) && str[i];
        i_prev = i, width_new = pen_x, c_prev = c, g_prev = g) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (blf_font_width_to_strlen_glyph_process(font, c_prev, c, g_prev, g, &pen_x, width_i)) {
       break;
@@ -737,7 +737,7 @@ size_t blf_font_width_to_rstrlen(
   i_prev = (size_t)((s_prev != NULL) ? s_prev - str : 0);
 
   i_tmp = i;
-  g = blf_utf8_next_fast(font, gc, str, &i_tmp, &c);
+  g = blf_utf8_next_fast(font, gc, str, str_len, &i_tmp, &c);
   for (width_new = pen_x = 0; (s != NULL);
        i = i_prev, s = s_prev, c = c_prev, g = g_prev, g_prev = NULL, width_new = pen_x) {
     s_prev = BLI_str_find_prev_char_utf8(str, s);
@@ -745,7 +745,7 @@ size_t blf_font_width_to_rstrlen(
 
     if (s_prev != NULL) {
       i_tmp = i_prev;
-      g_prev = blf_utf8_next_fast(font, gc, str, &i_tmp, &c_prev);
+      g_prev = blf_utf8_next_fast(font, gc, str, str_len, &i_tmp, &c_prev);
       BLI_assert(i_tmp == i);
     }
 
@@ -788,7 +788,7 @@ static void blf_font_boundbox_ex(FontBLF *font,
   box->ymax = -32000.0f;
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -961,7 +961,7 @@ static void blf_font_boundbox_foreach_glyph_ex(FontBLF *font,
 
   while ((i < str_len) && str[i]) {
     i_curr = i;
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -1051,7 +1051,7 @@ static void blf_font_wrap_apply(FontBLF *font,
     size_t i_curr = i;
     bool do_draw = false;
 
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -1202,7 +1202,7 @@ int blf_font_count_missing_chars(FontBLF *font,
     if ((c = str[i]) < GLYPH_ASCII_TABLE_SIZE) {
       i++;
     }
-    else if ((c = BLI_str_utf8_as_unicode_step(str, &i)) != BLI_UTF8_ERR) {
+    else if ((c = BLI_str_utf8_as_unicode_step(str, str_len, &i)) != BLI_UTF8_ERR) {
       if (FT_Get_Char_Index((font)->face, c) == 0) {
         missing++;
       }
diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c
index 06137f5d110..c2ab91251b6 100644
--- a/source/blender/blenkernel/intern/text.c
+++ b/source/blender/blenkernel/intern/text.c
@@ -1660,7 +1660,7 @@ void txt_insert_buf(Text *text, const char *in_buffer)
 
   /* Read the first line (or as close as possible */
   while (buffer[i] && buffer[i] != '\n') {
-    txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, &i));
+    txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, len, &i));
   }
 
   if (buffer[i] == '\n') {
@@ -1682,7 +1682,7 @@ void txt_insert_buf(Text *text, const char *in_buffer)
       }
       else {
         for (j = i - l; j < i && j < len;) {
-          txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, &j));
+          txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, len, &j));
         }
         break;
       }
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index e1d7e2c58f7..b936e39731d 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -43,8 +43,10 @@ unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *
     ATTR_NONNULL();
 unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
                                                    size_t *__restrict index) ATTR_NONNULL();
-unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
-    ATTR_NONNULL();
+unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
+                                          size_t p_len,
+                                          size_t *__restrict index) ATTR_NONNULL(1, 3);
+
 size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf);
 size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
                              const char *__restrict src_c,
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 5710bd6b150..dbde5221d7e 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -582,49 +582,69 @@ uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__r
 
 /**
  * Another variant that steps over the index.
+ *
+ * \param p: The text to step over.
+ * \param p_len: The length of `p`.
+ * \param index: Index of `p` to step over.
+ *
  * \note currently this also falls back to latin1 for text drawing.
+ *
+ * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
+ * must have the same behavior is encountering a nil byte,
+ * so functions that only use the first part of a string has matching behavior to functions
+ * that null terminate the text.
  */
-uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
+uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
+                                  const size_t p_len,
+                                  size_t *__restrict index)
 {
   int i, len;
   uint mask = 0;
   uint result;
-  unsigned char c;
+  const char c = p[*index];
 
-  p += *index;
-  c = (unsigned char)*p;
+  BLI_assert(*index < p_len);
+  BLI_assert(c != '\0');
 
   UTF8_COMPUTE(c, mask, len, -1);
   if (UNLIKELY(len == -1)) {
-    /* when called with NULL end, result will never be NULL,
-     * checks for a NULL character */
-    const char *p_next = BLI_str_find_next_char_utf8(p, NULL);
-    /* will never return the same pointer unless '\0',
-     * eternal loop is prevented */
-    *index += (size_t)(p_next - p);
-    return BLI_UTF8_ERR;
+    const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len);
+    /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate.
+     * so there is no chance this sets the index past the nil byte (assert this is the case). */
+    BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL));
+    len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index);
+    result = BLI_UTF8_ERR;
+  }
+  else if (UNLIKELY(*index + (size_t)len > p_len)) {
+    /* A multi-byte character reads past the buffer bounds,
+     * match the behavior of encountering an byte with invalid encoding below. */
+    len = 1;
+    result = (uint)c;
   }
-
-  /* this is tricky since there are a few ways we can bail out of bad unicode
-   * values, 3 possible solutions. */
+  else {
+    /* This is tricky since there are a few ways we can bail out of bad unicode
+     * values, 3 possible solutions. */
+    p += *index;
 #if 0
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
+    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
 #elif 1
-  /* WARNING: this is NOT part of glib, or supported by similar functions.
-   * this is added for text drawing because some filepaths can have latin1
-   * characters */
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-  if (result == BLI_UTF8_ERR) {
-    len = 1;
-    result = *p;
-  }
-  /* end warning! */
+    /* WARNING: this is NOT part of glib, or supported by similar functions.
+     * this is added for text drawing because some filepaths can have latin1
+     * characters */
+    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
+    if (result == BLI_UTF8_ERR) {
+      len = 1;
+      result = (uint)c;
+    }
+    /* end warning! */
 #else
-  /* without a fallback like '?', text drawing will stop on this value */
-  UTF8_GET(result, p, i, mask, len, '?');
+    /* Without a fallback like '?', text drawing will stop on this value. */
+    UTF8_GET(result, p, i, mask, len, '?');
 #endif
+  }
 
   *index += (size_t)len;
+  BLI_assert(*index <= p_len);
   return result;
 }
 
@@ -810,6 +830,7 @@ char *BLI_str_find_next_char_utf8(const char *p, const char *end)
 {
   if (*p) {
     if (end) {
+      BLI_assert(end >= p);
       for (++p; p < end && (*p & 0xc0) == 0x80; p++) {
         /* do nothing */
       }
diff --git a/source/blender/blenlib/tests/BLI_string_utf8_test.cc b/source/blender/blenlib/tests/BLI_string_utf8_test.cc
index 13f5cb7f284..b25f2310e1e 100644
--- a/source/blender/blenlib/tests/BLI_string_utf8_test.cc
+++ b/source/blender/blenlib/tests/BLI_string_utf8_test.cc
@@ -2,6 +2,7 @@
 
 #include "testing/testing.h"
 
+#include "BLI_rand.h"
 #include "BLI_string.h"
 #include "BLI_string_utf8.h"
 #include "BLI_utildefines.h"
@@ -11,7 +12,8 @@
  * quite their share of lines, they deserved their own file. */
 
 /* -------------------------------------------------------------------- */
-/* tests */
+/** \name Test #BLI_str_utf8_invalid_strip
+ * \{ */
 
 /* Breaking strings is confusing here, prefer over-long lines. */
 /* clang-format off */
@@ -284,3 +286,110 @@ TEST(string, Utf8InvalidBytes)
     EXPECT_STREQ(buff, tst_stripped);
   }
 }
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Test #BLI_str_utf8_as_unicode_step
+ * \{ */
+
+static size_t utf8_as_char32(const char *str, const char str_len, char32_t *r_result)
+{
+  size_t i = 0, result_len = 0;
+  while ((i < str_len) && (str[i] != '\0')) {
+    char32_t c = BLI_str_utf8_as_unicode_step(str, str_len, &i);
+    if (c != BLI_UTF8_ERR) {
+      r_result[result_len++] = c;
+    }
+  }
+  return i;
+}
+
+template<size_t Size, size_t SizeWithPadding>
+void utf8_as_char32_test_compare_with_pad_bytes(const char utf8_src[Size])
+{
+  char utf8_src_with_pad[SizeWithPadding] = {0};
+
+  memcpy(utf8_src_with_pad, utf8_src, Size);
+
+  char32_t unicode_dst_a[Size], unicode_dst_b[Size];
+
+  memset(unicode_dst_a, 0xff, sizeof(unicode_dst_a));
+  const size_t index_a = utf8_as_char32(utf8_src, Size, unicode_dst_a);
+
+  /* Test with padded and un-padded size,
+   * to ensure that extra available space doesn't yield a different result. */
+  for (int pass = 0; pass < 2; pass++) {
+    memset(unicode_dst_b, 0xff, sizeof(unicode_dst_b));
+    const size_t index_b = utf8_as_char32(
+        utf8_src_with_pad, pass ? Size : SizeWithPadding, unicode_dst_b);
+
+    /* Check the resulting content matches. */
+    EXPECT_EQ_ARRAY(unicode_dst_a, unicode_dst_b, Size);
+    /* Check the index of the source strings match. */
+    EXPECT_EQ(index_a, index_b);
+  }
+}
+
+template<size_t Size> void utf8_as_char32_test_compare(const char utf8_src[Size])
+{
+  /* Note that 7 is a little arbitrary,
+   * chosen since it's the maximum length of multi-byte character + 1
+   * to account for any errors that read past null bytes. */
+  utf8_as_char32_test_compare_with_pad_bytes<Size, Size + 1>(utf8_src);
+  utf8_as_char32_test_compare_with_pad_bytes<Size, Size + 7>(utf8_src);
+}
+
+template<size_t Size> void utf8_as_char32_test_at_buffer_size()
+{
+  char utf8_src[Size];
+
+  /* Test uniform bytes, also with offsets ascending & descending. */
+  for (int i = 0; i <= 0xff; i++) {
+    memset(utf8_src, i, sizeof(utf8_src));
+    utf8_as_char32_test_compare<Size>(utf8_src);
+
+    /* Offset trailing bytes up and down in steps of 1, 2, 4 .. etc. */
+    if (Size > 1) {
+      for (int mul = 1; mul < 256; mul *= 2) {
+        for (int ofs = 1; ofs < (int)Size; ofs++) {
+          utf8_src[ofs] = (char)(i + (ofs * mul));
+        }
+        utf8_as_char32_test_compare<Size>(utf8_src);
+
+        for (int ofs = 1; ofs < (int)Size; ofs++) {
+          utf8_src[ofs] = (char)(i - (ofs * mul));
+        }
+        utf8_as_char32_test_compare<Size>(utf8_src);
+      }
+    }
+  }
+
+  /* Random bytes. */
+  RNG *rng = BLI_rng_new(1);
+  for (int i = 0; i < 256; i++) {
+    BLI_rng_get_char_n(rng, utf8_src, sizeof(utf8_src));
+    utf8_as_char32_test_compare<Size>(utf8_src);
+  }
+  BLI_rng_free(rng);
+}
+
+TEST(string, Utf8AsUnicodeStep)
+{
+
+  /* Run tests at different buffer sizes. */
+  utf8_as_char32_test_at_buffer_size<1>();
+  utf8_as_char32_test_at_buffer_size<2>();
+  utf8_as_char32_test_at_buffer_size<3>();
+  utf8_as_char32_test_at_buffer_size<4>();
+  utf8_as_char32_test_at_buffer_size<5>();
+  utf8_as_char32_test_at_buffer_size<6>();
+  utf8_as_char32_test_at_buffer_size<7>();
+  utf8_as_char32_test_at_buffer_size<8>();
+  utf8_as_char32_test_at_buffer_size<9>();
+  utf8_as_char32_test_at_buffer_size<10>();
+  utf8_as_char32_test_at_buffer_size<11>();
+  utf8_as_char32_test_at_buffer_size<12>();
+}
+
+/** \} */
diff --git a/source/blender/editors/space_text/text_ops.c b/source/blender/editors/space_text/text_ops.c
index b5fcadbefe8..f480f60a2b9 100644
--- a/source/blender/editors/space_text/text_ops.c
+++ b/source/blender/editors/space_text/text_ops.c
@@ -3424,25 +3424,26 @@ static int text_insert_exec(bContext *C, wmOperator *op)
   SpaceText *st = CTX_wm_space_text(C);
   Text *text = CTX_data_edit_text(C);
   char *str;
+  int str_len;
   bool done = false;
   size_t i = 0;
   uint code;
 
   text_drawcache_tag_update(st, 0);
 
-  str = RNA_string_get_alloc(op->ptr, "text", NULL, 0, NULL);
+  str = RNA_string_get_alloc(op->ptr, "text", NULL, 0, &str_len);
 
   ED_text_undo_push_init(C);
 
   if (st && st->overwrite) {
     while (str[i]) {
-      code = BLI_str_utf8_as_unicode_step(str, &i);
+      code = BLI_str_utf8_as_unicode_step(str, str_len, &i);
       done |= txt_replace_char(text, code);
     }
   }
   else {
     while (str[i]) {
-      code = BLI_str_utf8_as_unicode_step(str, &i);
+      code = BLI_str_utf8_as_unicode_step(str, str_len, &i);
       done |= txt_add_char(text, code);
     }
   }