Fix BLI_str_utf8_as_unicode_step reading past intended bounds

Add a string length argument to BLI_str_utf8_as_unicode_step to prevent reading past the buffer bounds or the intended range since some callers of this function take a string length to operate on part of the string. Font drawing for example didn't respect the length argument, potentially causing a buffer over-read with multi-byte characters that could read past the end of the string. The following command would read 5 bytes past the end of the input. `BLF_draw(font_id, (char[]){252}, 1);` In practice strings are typically null terminated so this didn't crash reading past buffer bounds. Nevertheless, this wasn't correct and could cause bugs in the future. Clamping by the length now has the same behavior as a null byte. Add test to ensure this is working as intended.
author: Campbell Barton <ideasman42@gmail.com> 2021-08-24 06:25:26 +0300
committer: Campbell Barton <ideasman42@gmail.com> 2021-08-24 07:25:15 +0300
commit: 8b55cda04812255922f488ad6bacd228d5d290a6 (patch)
tree: 9d45f358d6cc1835c461763fdc0c83872854c331 /source
parent: 8371df8b1c12bdae574370d77819c46d8280f20f (diff)
6 files changed, 179 insertions, 46 deletions
diff --git a/source/blender/blenfont/intern/blf_font.c b/source/blender/blenfont/intern/blf_font.c
index 50b4bb09b7b..5ad48aa08d4 100644
--- a/source/blender/blenfont/intern/blf_font.c
+++ b/source/blender/blenfont/intern/blf_font.c
@@ -298,7 +298,7 @@ static void blf_batch_draw_end(void)
  */
 
 BLI_INLINE GlyphBLF *blf_utf8_next_fast(
-    FontBLF *font, GlyphCacheBLF *gc, const char *str, size_t *i_p, uint *r_c)
+    FontBLF *font, GlyphCacheBLF *gc, const char *str, size_t str_len, size_t *i_p, uint *r_c)
 {
   GlyphBLF *g;
   if ((*r_c = str[*i_p]) < GLYPH_ASCII_TABLE_SIZE) {
@@ -309,7 +309,7 @@ BLI_INLINE GlyphBLF *blf_utf8_next_fast(
     }
     (*i_p)++;
   }
-  else if ((*r_c = BLI_str_utf8_as_unicode_step(str, i_p)) != BLI_UTF8_ERR) {
+  else if ((*r_c = BLI_str_utf8_as_unicode_step(str, str_len, i_p)) != BLI_UTF8_ERR) {
     g = blf_glyph_search(gc, *r_c);
     if (UNLIKELY(g == NULL)) {
       g = blf_glyph_add(font, gc, FT_Get_Char_Index(font->face, *r_c), *r_c);
@@ -382,7 +382,7 @@ static void blf_font_draw_ex(FontBLF *font,
   blf_batch_draw_begin(font);
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -478,7 +478,7 @@ int blf_font_draw_mono(FontBLF *font, const char *str, const size_t str_len, int
   blf_batch_draw_begin(font);
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -535,7 +535,7 @@ static void blf_font_draw_buffer_ex(FontBLF *font,
   /* another buffer specific call for color conversion */
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -703,7 +703,7 @@ size_t blf_font_width_to_strlen(
 
   for (i_prev = i = 0, width_new = pen_x = 0, g_prev = NULL, c_prev = 0; (i < str_len) && str[i];
        i_prev = i, width_new = pen_x, c_prev = c, g_prev = g) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (blf_font_width_to_strlen_glyph_process(font, c_prev, c, g_prev, g, &pen_x, width_i)) {
       break;
@@ -737,7 +737,7 @@ size_t blf_font_width_to_rstrlen(
   i_prev = (size_t)((s_prev != NULL) ? s_prev - str : 0);
 
   i_tmp = i;
-  g = blf_utf8_next_fast(font, gc, str, &i_tmp, &c);
+  g = blf_utf8_next_fast(font, gc, str, str_len, &i_tmp, &c);
   for (width_new = pen_x = 0; (s != NULL);
        i = i_prev, s = s_prev, c = c_prev, g = g_prev, g_prev = NULL, width_new = pen_x) {
     s_prev = BLI_str_find_prev_char_utf8(str, s);
@@ -745,7 +745,7 @@ size_t blf_font_width_to_rstrlen(
 
     if (s_prev != NULL) {
       i_tmp = i_prev;
-      g_prev = blf_utf8_next_fast(font, gc, str, &i_tmp, &c_prev);
+      g_prev = blf_utf8_next_fast(font, gc, str, str_len, &i_tmp, &c_prev);
       BLI_assert(i_tmp == i);
     }
 
@@ -788,7 +788,7 @@ static void blf_font_boundbox_ex(FontBLF *font,
   box->ymax = -32000.0f;
 
   while ((i < str_len) && str[i]) {
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -961,7 +961,7 @@ static void blf_font_boundbox_foreach_glyph_ex(FontBLF *font,
 
   while ((i < str_len) && str[i]) {
     i_curr = i;
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -1051,7 +1051,7 @@ static void blf_font_wrap_apply(FontBLF *font,
     size_t i_curr = i;
     bool do_draw = false;
 
-    g = blf_utf8_next_fast(font, gc, str, &i, &c);
+    g = blf_utf8_next_fast(font, gc, str, str_len, &i, &c);
 
     if (UNLIKELY(c == BLI_UTF8_ERR)) {
       break;
@@ -1202,7 +1202,7 @@ int blf_font_count_missing_chars(FontBLF *font,
     if ((c = str[i]) < GLYPH_ASCII_TABLE_SIZE) {
       i++;
     }
-    else if ((c = BLI_str_utf8_as_unicode_step(str, &i)) != BLI_UTF8_ERR) {
+    else if ((c = BLI_str_utf8_as_unicode_step(str, str_len, &i)) != BLI_UTF8_ERR) {
       if (FT_Get_Char_Index((font)->face, c) == 0) {
         missing++;
       }
diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c
index 06137f5d110..c2ab91251b6 100644
--- a/source/blender/blenkernel/intern/text.c
+++ b/source/blender/blenkernel/intern/text.c
@@ -1660,7 +1660,7 @@ void txt_insert_buf(Text *text, const char *in_buffer)
 
   /* Read the first line (or as close as possible */
   while (buffer[i] && buffer[i] != '\n') {
-    txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, &i));
+    txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, len, &i));
   }
 
   if (buffer[i] == '\n') {
@@ -1682,7 +1682,7 @@ void txt_insert_buf(Text *text, const char *in_buffer)
       }
       else {
         for (j = i - l; j < i && j < len;) {
-          txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, &j));
+          txt_add_raw_char(text, BLI_str_utf8_as_unicode_step(buffer, len, &j));
         }
         break;
       }
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index e1d7e2c58f7..b936e39731d 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -43,8 +43,10 @@ unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *
     ATTR_NONNULL();
 unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
                                                    size_t *__restrict index) ATTR_NONNULL();
-unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
-    ATTR_NONNULL();
+unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
+                                          size_t p_len,
+                                          size_t *__restrict index) ATTR_NONNULL(1, 3);
+
 size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf);
 size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
                              const char *__restrict src_c,
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 5710bd6b150..dbde5221d7e 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -582,49 +582,69 @@ uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__r
 
 /**
  * Another variant that steps over the index.
+ *
+ * \param p: The text to step over.
+ * \param p_len: The length of `p`.
+ * \param index: Index of `p` to step over.
+ *
  * \note currently this also falls back to latin1 for text drawing.
+ *
+ * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
+ * must have the same behavior is encountering a nil byte,
+ * so functions that only use the first part of a string has matching behavior to functions
+ * that null terminate the text.
  */
-uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
+uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
+                                  const size_t p_len,
+                                  size_t *__restrict index)
 {
   int i, len;
   uint mask = 0;
   uint result;
-  unsigned char c;
+  const char c = p[*index];
 
-  p += *index;
-  c = (unsigned char)*p;
+  BLI_assert(*index < p_len);
+  BLI_assert(c != '\0');
 
   UTF8_COMPUTE(c, mask, len, -1);
   if (UNLIKELY(len == -1)) {
-    /* when called with NULL end, result will never be NULL,
-     * checks for a NULL character */
-    const char *p_next = BLI_str_find_next_char_utf8(p, NULL);
-    /* will never return the same pointer unless '\0',
-     * eternal loop is prevented */
-    *index += (size_t)(p_next - p);
-    return BLI_UTF8_ERR;
+    const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len);
+    /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate.
+     * so there is no chance this sets the index past the nil byte (assert this is the case). */
+    BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL));
+    len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index);
+    result = BLI_UTF8_ERR;
+  }
+  else if (UNLIKELY(*index + (size_t)len > p_len)) {
+    /* A multi-byte character reads past the buffer bounds,
+     * match the behavior of encountering an byte with invalid encoding below. */
+    len = 1;
+    result = (uint)c;
   }
-
-  /* this is tricky since there are a few ways we can bail out of bad unicode
-   * values, 3 possible solutions. */
+  else {
+    /* This is tricky since there are a few ways we can bail out of bad unicode
+     * values, 3 possible solutions. */
+    p += *index;
 #if 0
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
+    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
 #elif 1
-  /* WARNING: this is NOT part of glib, or supported by similar functions.
-   * this is added for text drawing because some filepaths can have latin1
-   * characters */
-  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-  if (result == BLI_UTF8_ERR) {
-    len = 1;
-    result = *p;
-  }
-  /* end warning! */
+    /* WARNING: this is NOT part of glib, or supported by similar functions.
+     * this is added for text drawing because some filepaths can have latin1
+     * characters */
+    UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
+    if (result == BLI_UTF8_ERR) {
+      len = 1;
+      result = (uint)c;
+    }
+    /* end warning! */
 #else
-  /* without a fallback like '?', text drawing will stop on this value */
-  UTF8_GET(result, p, i, mask, len, '?');
+    /* Without a fallback like '?', text drawing will stop on this value. */
+    UTF8_GET(result, p, i, mask, len, '?');
 #endif
+  }
 
   *index += (size_t)len;
+  BLI_assert(*index <= p_len);
   return result;
 }
 
@@ -810,6 +830,7 @@ char *BLI_str_find_next_char_utf8(const char *p, const char *end)
 {
   if (*p) {
     if (end) {
+      BLI_assert(end >= p);
       for (++p; p < end && (*p & 0xc0) == 0x80; p++) {
         /* do nothing */
       }
diff --git a/source/blender/blenlib/tests/BLI_string_utf8_test.cc b/source/blender/blenlib/tests/BLI_string_utf8_test.cc
index 13f5cb7f284..b25f2310e1e 100644
--- a/source/blender/blenlib/tests/BLI_string_utf8_test.cc
+++ b/source/blender/blenlib/tests/BLI_string_utf8_test.cc
@@ -2,6 +2,7 @@
 
 #include "testing/testing.h"
 
+#include "BLI_rand.h"
 #include "BLI_string.h"
 #include "BLI_string_utf8.h"
 #include "BLI_utildefines.h"
@@ -11,7 +12,8 @@
  * quite their share of lines, they deserved their own file. */
 
 /* -------------------------------------------------------------------- */
-/* tests */
+/** \name Test #BLI_str_utf8_invalid_strip
+ * \{ */
 
 /* Breaking strings is confusing here, prefer over-long lines. */
 /* clang-format off */
@@ -284,3 +286,110 @@ TEST(string, Utf8InvalidBytes)
     EXPECT_STREQ(buff, tst_stripped);
   }
 }
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Test #BLI_str_utf8_as_unicode_step
+ * \{ */
+
+static size_t utf8_as_char32(const char *str, const char str_len, char32_t *r_result)
+{
+  size_t i = 0, result_len = 0;
+  while ((i < str_len) && (str[i] != '\0')) {
+    char32_t c = BLI_str_utf8_as_unicode_step(str, str_len, &i);
+    if (c != BLI_UTF8_ERR) {
+      r_result[result_len++] = c;
+    }
+  }
+  return i;
+}
+
+template<size_t Size, size_t SizeWithPadding>
+void utf8_as_char32_test_compare_with_pad_bytes(const char utf8_src[Size])
+{
+  char utf8_src_with_pad[SizeWithPadding] = {0};
+
+  memcpy(utf8_src_with_pad, utf8_src, Size);
+
+  char32_t unicode_dst_a[Size], unicode_dst_b[Size];
+
+  memset(unicode_dst_a, 0xff, sizeof(unicode_dst_a));
+  const size_t index_a = utf8_as_char32(utf8_src, Size, unicode_dst_a);
+
+  /* Test with padded and un-padded size,
+   * to ensure that extra available space doesn't yield a different result. */
+  for (int pass = 0; pass < 2; pass++) {
+    memset(unicode_dst_b, 0xff, sizeof(unicode_dst_b));
+    const size_t index_b = utf8_as_char32(
+        utf8_src_with_pad, pass ? Size : SizeWithPadding, unicode_dst_b);
+
+    /* Check the resulting content matches. */
+    EXPECT_EQ_ARRAY(unicode_dst_a, unicode_dst_b, Size);
+    /* Check the index of the source strings match. */
+    EXPECT_EQ(index_a, index_b);
+  }
+}
+
+template<size_t Size> void utf8_as_char32_test_compare(const char utf8_src[Size])
+{
+  /* Note that 7 is a little arbitrary,
+   * chosen since it's the maximum length of multi-byte character + 1
+   * to account for any errors that read past null bytes. */
+  utf8_as_char32_test_compare_with_pad_bytes<Size, Size + 1>(utf8_src);
+  utf8_as_char32_test_compare_with_pad_bytes<Size, Size + 7>(utf8_src);
+}
+
+template<size_t Size> void utf8_as_char32_test_at_buffer_size()
+{
+  char utf8_src[Size];
+
+  /* Test uniform bytes, also with offsets ascending & descending. */
+  for (int i = 0; i <= 0xff; i++) {
+    memset(utf8_src, i, sizeof(utf8_src));
+    utf8_as_char32_test_compare<Size>(utf8_src);
+
+    /* Offset trailing bytes up and down in steps of 1, 2, 4 .. etc. */
+    if (Size > 1) {
+      for (int mul = 1; mul < 256; mul *= 2) {
+        for (int ofs = 1; ofs < (int)Size; ofs++) {
+          utf8_src[ofs] = (char)(i + (ofs * mul));
+        }
+        utf8_as_char32_test_compare<Size>(utf8_src);
+
+        for (int ofs = 1; ofs < (int)Size; ofs++) {
+          utf8_src[ofs] = (char)(i - (ofs * mul));
+        }
+        utf8_as_char32_test_compare<Size>(utf8_src);
+      }
+    }
+  }
+
+  /* Random bytes. */
+  RNG *rng = BLI_rng_new(1);
+  for (int i = 0; i < 256; i++) {
+    BLI_rng_get_char_n(rng, utf8_src, sizeof(utf8_src));
+    utf8_as_char32_test_compare<Size>(utf8_src);
+  }
+  BLI_rng_free(rng);
+}
+
+TEST(string, Utf8AsUnicodeStep)
+{
+
+  /* Run tests at different buffer sizes. */
+  utf8_as_char32_test_at_buffer_size<1>();
+  utf8_as_char32_test_at_buffer_size<2>();
+  utf8_as_char32_test_at_buffer_size<3>();
+  utf8_as_char32_test_at_buffer_size<4>();
+  utf8_as_char32_test_at_buffer_size<5>();
+  utf8_as_char32_test_at_buffer_size<6>();
+  utf8_as_char32_test_at_buffer_size<7>();
+  utf8_as_char32_test_at_buffer_size<8>();
+  utf8_as_char32_test_at_buffer_size<9>();
+  utf8_as_char32_test_at_buffer_size<10>();
+  utf8_as_char32_test_at_buffer_size<11>();
+  utf8_as_char32_test_at_buffer_size<12>();
+}
+
+/** \} */
diff --git a/source/blender/editors/space_text/text_ops.c b/source/blender/editors/space_text/text_ops.c
index b5fcadbefe8..f480f60a2b9 100644
--- a/source/blender/editors/space_text/text_ops.c
+++ b/source/blender/editors/space_text/text_ops.c
@@ -3424,25 +3424,26 @@ static int text_insert_exec(bContext *C, wmOperator *op)
   SpaceText *st = CTX_wm_space_text(C);
   Text *text = CTX_data_edit_text(C);
   char *str;
+  int str_len;
   bool done = false;
   size_t i = 0;
   uint code;
 
   text_drawcache_tag_update(st, 0);
 
-  str = RNA_string_get_alloc(op->ptr, "text", NULL, 0, NULL);
+  str = RNA_string_get_alloc(op->ptr, "text", NULL, 0, &str_len);
 
   ED_text_undo_push_init(C);
 
   if (st && st->overwrite) {
     while (str[i]) {
-      code = BLI_str_utf8_as_unicode_step(str, &i);
+      code = BLI_str_utf8_as_unicode_step(str, str_len, &i);
       done |= txt_replace_char(text, code);
     }
   }
   else {
     while (str[i]) {
-      code = BLI_str_utf8_as_unicode_step(str, &i);
+      code = BLI_str_utf8_as_unicode_step(str, str_len, &i);
       done |= txt_add_char(text, code);
     }
   }
author	Campbell Barton <ideasman42@gmail.com>	2021-08-24 06:25:26 +0300
committer	Campbell Barton <ideasman42@gmail.com>	2021-08-24 07:25:15 +0300
commit	8b55cda04812255922f488ad6bacd228d5d290a6 (patch)
tree	9d45f358d6cc1835c461763fdc0c83872854c331 /source
parent	8371df8b1c12bdae574370d77819c46d8280f20f (diff)