Cleanup: utf8 stepping functions

Various changes to reduce the risk of out-of-bounds errors in utf8 seeking.

- Remove BLI_str_prev_char_utf8. This function could potentially scan past the beginning of a string. Use BLI_str_find_prev_char_utf8 instead, which takes a limiting string start argument.
- Swap arguments for BLI_str_find_prev_char_utf8 so the stepping argument is first and the limiting argument is last. This matches BLI_str_find_next_char_utf8.
- Change behavior of these functions to return the start or end pointers instead of NULL, which complicated use of these functions to calculate offsets. Callers that need to check if the limits were reached can compare the return value with the start/end pointers.
- Return 'const char *' from these functions so they don't remove const from the input arguments.
author: Campbell Barton <ideasman42@gmail.com> 2021-08-27 09:42:31 +0300
committer: Campbell Barton <ideasman42@gmail.com> 2021-08-27 10:02:53 +0300
commit: 89dae554f9d5ae0204ad9c51c5ba00e14b16e858 (patch)
tree: 2dc6cc9151fe6eba664dfb5b39a14e7c620ce974 /source/blender/blenlib
parent: 523bc981cfeecead5050e7af44bbe252c166d718 (diff)
3 files changed, 38 insertions, 74 deletions
diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h
index a9cb13a3277..937b36758f2 100644
--- a/source/blender/blenlib/BLI_string_utf8.h
+++ b/source/blender/blenlib/BLI_string_utf8.h
@@ -57,11 +57,10 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
                              const size_t maxncpy) ATTR_NONNULL(1, 2);
 size_t BLI_str_utf32_as_utf8_len(const char32_t *src) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
 
-char *BLI_str_find_prev_char_utf8(const char *str, const char *p) ATTR_WARN_UNUSED_RESULT
-    ATTR_NONNULL(1, 2);
-char *BLI_str_find_next_char_utf8(const char *p, const char *end) ATTR_WARN_UNUSED_RESULT
-    ATTR_NONNULL(1);
-char *BLI_str_prev_char_utf8(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
+const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
+    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);
+const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
+    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);
 
 /* wchar_t functions, copied from blenders own font.c originally */
 size_t BLI_wstrlen_utf8(const wchar_t *src) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT;
diff --git a/source/blender/blenlib/intern/string_cursor_utf8.c b/source/blender/blenlib/intern/string_cursor_utf8.c
index f76a3114e09..eb49572f06c 100644
--- a/source/blender/blenlib/intern/string_cursor_utf8.c
+++ b/source/blender/blenlib/intern/string_cursor_utf8.c
@@ -117,7 +117,7 @@ bool BLI_str_cursor_step_next_utf8(const char *str, size_t maxlen, int *pos)
   const char *str_end = str + (maxlen + 1);
   const char *str_pos = str + (*pos);
   const char *str_next = BLI_str_find_next_char_utf8(str_pos, str_end);
-  if (str_next) {
+  if (str_next != str_end) {
     (*pos) += (str_next - str_pos);
     if ((*pos) > (int)maxlen) {
       (*pos) = (int)maxlen;
@@ -132,11 +132,9 @@ bool BLI_str_cursor_step_prev_utf8(const char *str, size_t UNUSED(maxlen), int *
 {
   if ((*pos) > 0) {
     const char *str_pos = str + (*pos);
-    const char *str_prev = BLI_str_find_prev_char_utf8(str, str_pos);
-    if (str_prev) {
-      (*pos) -= (str_pos - str_prev);
-      return true;
-    }
+    const char *str_prev = BLI_str_find_prev_char_utf8(str_pos, str);
+    (*pos) -= (str_pos - str_prev);
+    return true;
   }
 
   return false;
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index e35e2bcca3c..222b4df7c0e 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -686,12 +686,7 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
     else {
       *dst_w = '?';
       const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
-      if (src_c_next != NULL) {
-        index = (size_t)(src_c_next - src_c);
-      }
-      else {
-        index += 1;
-      }
+      index = (size_t)(src_c_next - src_c);
     }
     dst_w++;
     len++;
@@ -758,31 +753,33 @@ size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
  * \param p: pointer to some position within \a str
  *
  * Given a position \a p with a UTF-8 encoded string \a str, find the start
- * of the previous UTF-8 character starting before. \a p Returns %NULL if no
- * UTF-8 characters are present in \a str before \a p
+ * of the previous UTF-8 character starting before \a p. Returns \a str_start if no
+ * UTF-8 characters are present in \a str_start before \a p.
  *
  * \a p does not have to be at the beginning of a UTF-8 character. No check
  * is made to see if the character found is actually valid other than
  * it starts with an appropriate byte.
  *
- * Return value: a pointer to the found character or %NULL.
+ * \return A pointer to the found character.
  */
-char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
+const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
 {
-  for (--p; p >= str; p--) {
-    if ((*p & 0xc0) != 0x80) {
-      return (char *)p;
+  BLI_assert(p >= str_start);
+  if (str_start < p) {
+    for (--p; p >= str_start; p--) {
+      if ((*p & 0xc0) != 0x80) {
+        return (char *)p;
+      }
     }
   }
-  return NULL;
+  return p;
 }
 
 /* was g_utf8_find_next_char */
 /**
  * BLI_str_find_next_char_utf8:
  * \param p: a pointer to a position within a UTF-8 encoded string
- * \param end: a pointer to the byte following the end of the string,
- * or %NULL to indicate that the string is nul-terminated.
+ * \param str_end: a pointer to the byte following the end of the string.
  *
  * Finds the start of the next UTF-8 character in the string after \a p
  *
@@ -790,50 +787,18 @@ char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
  * is made to see if the character found is actually valid other than
  * it starts with an appropriate byte.
  *
- * Return value: a pointer to the found character or %NULL
- */
-char *BLI_str_find_next_char_utf8(const char *p, const char *end)
-{
-  if (*p) {
-    if (end) {
-      BLI_assert(end >= p);
-      for (++p; p < end && (*p & 0xc0) == 0x80; p++) {
-        /* do nothing */
-      }
-    }
-    else {
-      for (++p; (*p & 0xc0) == 0x80; p++) {
-        /* do nothing */
-      }
-    }
-  }
-  return (p == end) ? NULL : (char *)p;
-}
-
-/* was g_utf8_prev_char */
-/**
- * BLI_str_prev_char_utf8:
- * \param p: a pointer to a position within a UTF-8 encoded string
- *
- * Finds the previous UTF-8 character in the string before \a p
- *
- * \a p does not have to be at the beginning of a UTF-8 character. No check
- * is made to see if the character found is actually valid other than
- * it starts with an appropriate byte. If \a p might be the first
- * character of the string, you must use g_utf8_find_prev_char() instead.
- *
- * Return value: a pointer to the found character.
+ * \return a pointer to the found character, \a str_end when the end of the string is reached,
+ * or a pointer to the null terminating character '\0'.
  */
-char *BLI_str_prev_char_utf8(const char *p)
+const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
 {
-  while (1) {
-    p--;
-    if ((*p & 0xc0) != 0x80) {
-      return (char *)p;
+  BLI_assert(p <= str_end);
+  if ((p < str_end) && (*p != '\0')) {
+    for (++p; p < str_end && (*p & 0xc0) == 0x80; p++) {
+      /* do nothing */
     }
   }
+  return p;
 }
-/* end glib copy */
 
 size_t BLI_str_partition_utf8(const char *str,
                               const uint delim[],
@@ -858,19 +823,21 @@ size_t BLI_str_partition_ex_utf8(const char *str,
                                  const char **suf,
                                  const bool from_right)
 {
-  const uint *d;
   const size_t str_len = end ? (size_t)(end - str) : strlen(str);
-  size_t index;
+  if (end == NULL) {
+    end = str + str_len;
+  }
 
   /* Note that here, we assume end points to a valid utf8 char! */
-  BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR)));
+  BLI_assert((end >= str) && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR));
 
   *suf = (char *)(str + str_len);
 
-  for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str),
-      index = 0;
-       *sep >= str && (!end || *sep < end) && **sep != '\0';
-       *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
+  size_t index;
+  for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(end, str) : str), index = 0;
+       from_right ? (*sep > str) : ((*sep < end) && (**sep != '\0'));
+       *sep = (char *)(from_right ? (str != *sep ? BLI_str_find_prev_char_utf8(*sep, str) : NULL) :
+                                    str + index)) {
     size_t index_ofs = 0;
     const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
     index += index_ofs;
@@ -880,7 +847,7 @@ size_t BLI_str_partition_ex_utf8(const char *str,
       break;
     }
 
-    for (d = delim; *d != '\0'; d++) {
+    for (const uint *d = delim; *d != '\0'; d++) {
       if (*d == c) {
         /* *suf is already correct in case from_right is true. */
         if (!from_right) {
author	Campbell Barton <ideasman42@gmail.com>	2021-08-27 09:42:31 +0300
committer	Campbell Barton <ideasman42@gmail.com>	2021-08-27 10:02:53 +0300
commit	89dae554f9d5ae0204ad9c51c5ba00e14b16e858 (patch)
tree	2dc6cc9151fe6eba664dfb5b39a14e7c620ce974 /source/blender/blenlib
parent	523bc981cfeecead5050e7af44bbe252c166d718 (diff)