Cleanup: utf8 stepping functions

Various changes to reduce risk of out of bounds errors in utf8 seeking. - Remove BLI_str_prev_char_utf8 This function could potentially scan past the beginning of a string. Use BLI_str_find_prev_char_utf8 instead which takes a limiting string start argument. - Swap arguments for BLI_str_find_prev_char_utf8 so the stepping argument is first and the limiting argument is last. This matches BLI_str_find_next_char_utf8. - Change behavior of these functions to return it the start or end pointers instead of NULL, which complicated use of these functions to calculate offsets. Callers that need to check if the limits were reached can compare the return value with the start/end pointers. - Return 'const char *' from these functions so they don't remove const from the input arguments.
author: Campbell Barton <ideasman42@gmail.com> 2021-08-27 09:42:31 +0300
committer: Campbell Barton <ideasman42@gmail.com> 2021-08-27 10:02:53 +0300
commit: 89dae554f9d5ae0204ad9c51c5ba00e14b16e858 (patch)
tree: 2dc6cc9151fe6eba664dfb5b39a14e7c620ce974
parent: 523bc981cfeecead5050e7af44bbe252c166d718 (diff)
10 files changed, 64 insertions, 97 deletions
diff --git a/source/blender/blenfont/intern/blf_font.c b/source/blender/blenfont/intern/blf_font.c
index dbcd1d6016d..27478bd7f8e 100644
--- a/source/blender/blenfont/intern/blf_font.c
+++ b/source/blender/blenfont/intern/blf_font.c
@@ -673,23 +673,23 @@ size_t blf_font_width_to_rstrlen(
   GlyphBLF *g, *g_prev;
   int pen_x, width_new;
   size_t i, i_prev, i_tmp;
-  char *s, *s_prev;
+  const char *s, *s_prev;
 
   GlyphCacheBLF *gc = blf_glyph_cache_acquire(font);
   const int width_i = (int)width;
 
   i = BLI_strnlen(str, str_len);
-  s = BLI_str_find_prev_char_utf8(str, &str[i]);
-  i = (size_t)((s != NULL) ? s - str : 0);
-  s_prev = BLI_str_find_prev_char_utf8(str, s);
-  i_prev = (size_t)((s_prev != NULL) ? s_prev - str : 0);
+  s = BLI_str_find_prev_char_utf8(&str[i], str);
+  i = (size_t)(s - str);
+  s_prev = BLI_str_find_prev_char_utf8(s, str);
+  i_prev = (size_t)(s_prev - str);
 
   i_tmp = i;
   g = blf_utf8_next_fast(font, gc, str, str_len, &i_tmp, &c);
   for (width_new = pen_x = 0; (s != NULL);
        i = i_prev, s = s_prev, c = c_prev, g = g_prev, g_prev = NULL, width_new = pen_x) {
-    s_prev = BLI_str_find_prev_char_utf8(str, s);
-    i_prev = (size_t)((s_prev != NULL) ? s_prev - str : 0);
+    s_prev = BLI_str_find_prev_char_utf8(s, str);
+    i_prev = (size_t)(s_prev - str);
 
     if (s_prev != NULL) {
       i_tmp = i_prev;
diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c
index 7f1f6590e48..bdc82fe626c 100644
--- a/source/blender/blenkernel/intern/text.c
+++ b/source/blender/blenkernel/intern/text.c
@@ -933,7 +933,7 @@ void txt_move_left(Text *text, const bool sel)
       (*charp) -= tabsize;
     }
     else {
-      const char *prev = BLI_str_prev_char_utf8((*linep)->line + *charp);
+      const char *prev = BLI_str_find_prev_char_utf8((*linep)->line + *charp, (*linep)->line);
       *charp = prev - (*linep)->line;
     }
   }
@@ -1938,7 +1938,8 @@ void txt_backspace_char(Text *text)
     txt_pop_sel(text);
   }
   else { /* Just backspacing a char */
-    const char *prev = BLI_str_prev_char_utf8(text->curl->line + text->curc);
+    const char *prev = BLI_str_find_prev_char_utf8(text->curl->line + text->curc,
+                                                   text->curl->line);
     size_t c_len = prev - text->curl->line;
     c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len);
     c_len -= prev - text->curl->line;
diff --git a/source/blender/blenkernel/intern/unit.c b/source/blender/blenkernel/intern/unit.c
index 4581d410444..4e9a3c9fb2e 100644
--- a/source/blender/blenkernel/intern/unit.c
+++ b/source/blender/blenkernel/intern/unit.c
@@ -717,7 +717,7 @@ static const char *unit_find_str(const char *str, const char *substr, bool case_
       if (str_found == str ||
           /* Weak unicode support!, so "µm" won't match up be replaced by "m"
            * since non ascii utf8 values will NEVER return true */
-          isalpha_or_utf8(*BLI_str_prev_char_utf8(str_found)) == 0) {
+          isalpha_or_utf8(*BLI_str_find_prev_char_utf8(str_found, str)) == 0) {
         /* Next char cannot be alphanum. */
         int len_name = strlen(substr);
 
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index a9cb13a3277..937b36758f2 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -57,11 +57,10 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
                              const size_t maxncpy) ATTR_NONNULL(1, 2);
 size_t BLI_str_utf32_as_utf8_len(const char32_t *src) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
 
-char *BLI_str_find_prev_char_utf8(const char *str, const char *p) ATTR_WARN_UNUSED_RESULT
-    ATTR_NONNULL(1, 2);
-char *BLI_str_find_next_char_utf8(const char *p, const char *end) ATTR_WARN_UNUSED_RESULT
-    ATTR_NONNULL(1);
-char *BLI_str_prev_char_utf8(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
+const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
+    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);
+const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
+    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);
 
 /* wchar_t functions, copied from blenders own font.c originally */
 size_t BLI_wstrlen_utf8(const wchar_t *src) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT;
diff --git a/source/blender/blenlib/intern/string_cursor_utf8.c b/source/blender/blenlib/intern/string_cursor_utf8.c
index f76a3114e09..eb49572f06c 100644
--- a/source/blender/blenlib/intern/string_cursor_utf8.c
+++ b/source/blender/blenlib/intern/string_cursor_utf8.c
@@ -117,7 +117,7 @@ bool BLI_str_cursor_step_next_utf8(const char *str, size_t maxlen, int *pos)
   const char *str_end = str + (maxlen + 1);
   const char *str_pos = str + (*pos);
   const char *str_next = BLI_str_find_next_char_utf8(str_pos, str_end);
-  if (str_next) {
+  if (str_next != str_end) {
     (*pos) += (str_next - str_pos);
     if ((*pos) > (int)maxlen) {
       (*pos) = (int)maxlen;
@@ -132,11 +132,9 @@ bool BLI_str_cursor_step_prev_utf8(const char *str, size_t UNUSED(maxlen), int *
 {
   if ((*pos) > 0) {
     const char *str_pos = str + (*pos);
-    const char *str_prev = BLI_str_find_prev_char_utf8(str, str_pos);
-    if (str_prev) {
-      (*pos) -= (str_pos - str_prev);
-      return true;
-    }
+    const char *str_prev = BLI_str_find_prev_char_utf8(str_pos, str);
+    (*pos) -= (str_pos - str_prev);
+    return true;
   }
 
   return false;
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index e35e2bcca3c..222b4df7c0e 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -686,12 +686,7 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
     else {
       *dst_w = '?';
       const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
-      if (src_c_next != NULL) {
-        index = (size_t)(src_c_next - src_c);
-      }
-      else {
-        index += 1;
-      }
+      index = (size_t)(src_c_next - src_c);
     }
     dst_w++;
     len++;
@@ -758,31 +753,33 @@ size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
  * \param p: pointer to some position within \a str
  *
  * Given a position \a p with a UTF-8 encoded string \a str, find the start
- * of the previous UTF-8 character starting before. \a p Returns %NULL if no
- * UTF-8 characters are present in \a str before \a p
+ * of the previous UTF-8 character starting before. \a p Returns \a str_start if no
+ * UTF-8 characters are present in \a str_start before \a p.
  *
  * \a p does not have to be at the beginning of a UTF-8 character. No check
  * is made to see if the character found is actually valid other than
  * it starts with an appropriate byte.
  *
- * Return value: a pointer to the found character or %NULL.
+ * \return A pointer to the found character.
  */
-char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
+const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
 {
-  for (--p; p >= str; p--) {
-    if ((*p & 0xc0) != 0x80) {
-      return (char *)p;
+  BLI_assert(p >= str_start);
+  if (str_start < p) {
+    for (--p; p >= str_start; p--) {
+      if ((*p & 0xc0) != 0x80) {
+        return (char *)p;
+      }
     }
   }
-  return NULL;
+  return p;
 }
 
 /* was g_utf8_find_next_char */
 /**
  * BLI_str_find_next_char_utf8:
  * \param p: a pointer to a position within a UTF-8 encoded string
- * \param end: a pointer to the byte following the end of the string,
- * or %NULL to indicate that the string is nul-terminated.
+ * \param end: a pointer to the byte following the end of the string.
  *
  * Finds the start of the next UTF-8 character in the string after \a p
  *
@@ -790,50 +787,18 @@ char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
  * is made to see if the character found is actually valid other than
  * it starts with an appropriate byte.
  *
- * Return value: a pointer to the found character or %NULL
- */
-char *BLI_str_find_next_char_utf8(const char *p, const char *end)
-{
-  if (*p) {
-    if (end) {
-      BLI_assert(end >= p);
-      for (++p; p < end && (*p & 0xc0) == 0x80; p++) {
-        /* do nothing */
-      }
-    }
-    else {
-      for (++p; (*p & 0xc0) == 0x80; p++) {
-        /* do nothing */
-      }
-    }
-  }
-  return (p == end) ? NULL : (char *)p;
-}
-
-/* was g_utf8_prev_char */
-/**
- * BLI_str_prev_char_utf8:
- * \param p: a pointer to a position within a UTF-8 encoded string
- *
- * Finds the previous UTF-8 character in the string before \a p
- *
- * \a p does not have to be at the beginning of a UTF-8 character. No check
- * is made to see if the character found is actually valid other than
- * it starts with an appropriate byte. If \a p might be the first
- * character of the string, you must use g_utf8_find_prev_char() instead.
- *
- * Return value: a pointer to the found character.
+ * \return a pointer to the found character or a pointer to the null terminating character '\0'.
  */
-char *BLI_str_prev_char_utf8(const char *p)
+const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
 {
-  while (1) {
-    p--;
-    if ((*p & 0xc0) != 0x80) {
-      return (char *)p;
+  BLI_assert(p <= str_end);
+  if ((p < str_end) && (*p != '\0')) {
+    for (++p; p < str_end && (*p & 0xc0) == 0x80; p++) {
+      /* do nothing */
     }
   }
+  return p;
 }
-/* end glib copy */
 
 size_t BLI_str_partition_utf8(const char *str,
                               const uint delim[],
@@ -858,19 +823,21 @@ size_t BLI_str_partition_ex_utf8(const char *str,
                                  const char **suf,
                                  const bool from_right)
 {
-  const uint *d;
   const size_t str_len = end ? (size_t)(end - str) : strlen(str);
-  size_t index;
+  if (end == NULL) {
+    end = str + str_len;
+  }
 
   /* Note that here, we assume end points to a valid utf8 char! */
-  BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR)));
+  BLI_assert((end >= str) && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR));
 
   *suf = (char *)(str + str_len);
 
-  for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str),
-      index = 0;
-       *sep >= str && (!end || *sep < end) && **sep != '\0';
-       *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
+  size_t index;
+  for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(end, str) : str), index = 0;
+       from_right ? (*sep > str) : ((*sep < end) && (**sep != '\0'));
+       *sep = (char *)(from_right ? (str != *sep ? BLI_str_find_prev_char_utf8(*sep, str) : NULL) :
+                                    str + index)) {
     size_t index_ofs = 0;
     const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
     index += index_ofs;
@@ -880,7 +847,7 @@ size_t BLI_str_partition_ex_utf8(const char *str,
       break;
     }
 
-    for (d = delim; *d != '\0'; d++) {
+    for (const uint *d = delim; *d != '\0'; d++) {
       if (*d == c) {
         /* *suf is already correct in case from_right is true. */
         if (!from_right) {
diff --git a/source/blender/editors/interface/interface_handlers.c b/source/blender/editors/interface/interface_handlers.c
index a618daec4bc..977e9661dd9 100644
--- a/source/blender/editors/interface/interface_handlers.c
+++ b/source/blender/editors/interface/interface_handlers.c
@@ -2933,8 +2933,9 @@ static int ui_text_position_from_hidden(uiBut *but, int pos)
 {
   const char *butstr = (but->editstr) ? but->editstr : but->drawstr;
   const char *strpos = butstr;
+  const char *str_end = butstr + strlen(butstr);
   for (int i = 0; i < pos; i++) {
-    strpos = BLI_str_find_next_char_utf8(strpos, NULL);
+    strpos = BLI_str_find_next_char_utf8(strpos, str_end);
   }
 
   return (strpos - butstr);
diff --git a/source/blender/editors/interface/interface_widgets.c b/source/blender/editors/interface/interface_widgets.c
index 48f638dac33..a2b86ccd947 100644
--- a/source/blender/editors/interface/interface_widgets.c
+++ b/source/blender/editors/interface/interface_widgets.c
@@ -1498,15 +1498,15 @@ static void widget_draw_submenu_tria(const uiBut *but,
 
 static void ui_text_clip_give_prev_off(uiBut *but, const char *str)
 {
-  const char *prev_utf8 = BLI_str_find_prev_char_utf8(str, str + but->ofs);
+  const char *prev_utf8 = BLI_str_find_prev_char_utf8(str + but->ofs, str);
   const int bytes = str + but->ofs - prev_utf8;
 
   but->ofs -= bytes;
 }
 
-static void ui_text_clip_give_next_off(uiBut *but, const char *str)
+static void ui_text_clip_give_next_off(uiBut *but, const char *str, const char *str_end)
 {
-  const char *next_utf8 = BLI_str_find_next_char_utf8(str + but->ofs, NULL);
+  const char *next_utf8 = BLI_str_find_next_char_utf8(str + but->ofs, str_end);
   const int bytes = next_utf8 - (str + but->ofs);
 
   but->ofs += bytes;
@@ -1739,7 +1739,8 @@ static void ui_text_clip_cursor(const uiFontStyle *fstyle, uiBut *but, const rct
   but->strwidth = BLF_width(fstyle->uifont_id, but->editstr + but->ofs, INT_MAX);
 
   if (but->strwidth > okwidth) {
-    int len = strlen(but->editstr);
+    const int editstr_len = strlen(but->editstr);
+    int len = editstr_len;
 
     while (but->strwidth > okwidth) {
       float width;
@@ -1749,7 +1750,7 @@ static void ui_text_clip_cursor(const uiFontStyle *fstyle, uiBut *but, const rct
 
       /* if cursor is at 20 pixels of right side button we clip left */
       if (width > okwidth - 20) {
-        ui_text_clip_give_next_off(but, but->editstr);
+        ui_text_clip_give_next_off(but, but->editstr, but->editstr + editstr_len);
       }
       else {
         int bytes;
@@ -1757,7 +1758,7 @@ static void ui_text_clip_cursor(const uiFontStyle *fstyle, uiBut *but, const rct
         if (width < 20 && but->ofs > 0) {
           ui_text_clip_give_prev_off(but, but->editstr);
         }
-        bytes = BLI_str_utf8_size(BLI_str_find_prev_char_utf8(but->editstr, but->editstr + len));
+        bytes = BLI_str_utf8_size(BLI_str_find_prev_char_utf8(but->editstr + len, but->editstr));
         if (bytes == -1) {
           bytes = 1;
         }
@@ -1805,7 +1806,7 @@ static void ui_text_clip_right_label(const uiFontStyle *fstyle, uiBut *but, cons
 
     /* chop off the leading text, starting from the right */
     while (but->strwidth > okwidth && cp2 > but->drawstr) {
-      const char *prev_utf8 = BLI_str_find_prev_char_utf8(but->drawstr, cp2);
+      const char *prev_utf8 = BLI_str_find_prev_char_utf8(cp2, but->drawstr);
       const int bytes = cp2 - prev_utf8;
 
       /* shift the text after and including cp2 back by 1 char,
@@ -1825,7 +1826,7 @@ static void ui_text_clip_right_label(const uiFontStyle *fstyle, uiBut *but, cons
 
     /* after the leading text is gone, chop off the : and following space, with ofs */
     while ((but->strwidth > okwidth) && (but->ofs < 2)) {
-      ui_text_clip_give_next_off(but, but->drawstr);
+      ui_text_clip_give_next_off(but, but->drawstr, but->drawstr + drawstr_len);
       but->strwidth = BLF_width(
           fstyle->uifont_id, but->drawstr + but->ofs, sizeof(but->drawstr) - but->ofs);
       if (but->strwidth < 10) {
diff --git a/source/blender/editors/space_text/text_draw.c b/source/blender/editors/space_text/text_draw.c
index b6ba95885e4..99fcb2092c3 100644
--- a/source/blender/editors/space_text/text_draw.c
+++ b/source/blender/editors/space_text/text_draw.c
@@ -1472,7 +1472,7 @@ static void draw_brackets(const SpaceText *st, const TextDrawContext *tdc, ARegi
     /* closing bracket, search backward for open */
     fc--;
     if (c > 0) {
-      c -= linep->line + c - BLI_str_prev_char_utf8(linep->line + c);
+      c -= linep->line + c - BLI_str_find_prev_char_utf8(linep->line + c, linep->line);
     }
     while (linep) {
       while (fc >= 0) {
@@ -1493,7 +1493,7 @@ static void draw_brackets(const SpaceText *st, const TextDrawContext *tdc, ARegi
         }
         fc--;
         if (c > 0) {
-          c -= linep->line + c - BLI_str_prev_char_utf8(linep->line + c);
+          c -= linep->line + c - BLI_str_find_prev_char_utf8(linep->line + c, linep->line);
         }
       }
       if (endl) {
@@ -1508,7 +1508,7 @@ static void draw_brackets(const SpaceText *st, const TextDrawContext *tdc, ARegi
           fc = -1;
         }
         if (linep->len) {
-          c = BLI_str_prev_char_utf8(linep->line + linep->len) - linep->line;
+          c = BLI_str_find_prev_char_utf8(linep->line + linep->len, linep->line) - linep->line;
         }
         else {
           fc = -1;
diff --git a/source/blender/editors/space_text/text_ops.c b/source/blender/editors/space_text/text_ops.c
index f480f60a2b9..b7185766224 100644
--- a/source/blender/editors/space_text/text_ops.c
+++ b/source/blender/editors/space_text/text_ops.c
@@ -1955,7 +1955,7 @@ static void txt_wrap_move_eol(SpaceText *st, ARegion *region, const bool sel)
         end = MIN2(end, i);
 
         if (chop) {
-          endj = BLI_str_prev_char_utf8((*linep)->line + j) - (*linep)->line;
+          endj = BLI_str_find_prev_char_utf8((*linep)->line + j, (*linep)->line) - (*linep)->line;
         }
 
         if (endj >= oldc) {
author	Campbell Barton <ideasman42@gmail.com>	2021-08-27 09:42:31 +0300
committer	Campbell Barton <ideasman42@gmail.com>	2021-08-27 10:02:53 +0300
commit	89dae554f9d5ae0204ad9c51c5ba00e14b16e858 (patch)
tree	2dc6cc9151fe6eba664dfb5b39a14e7c620ce974
parent	523bc981cfeecead5050e7af44bbe252c166d718 (diff)