From 457302b67b9de6a92240c2736306cfa01187101d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Sat, 28 Aug 2021 22:44:55 +1000 Subject: BLI_string_utf8: add buffer size arg to BLI_str_utf8_from_unicode Besides helping to avoid buffer overflow errors this reduces complexity of BLI_str_utf32_as_utf8 which needed a special loop for the last 6 characters to avoid writing past the buffer bounds. Also add BLI_str_utf8_from_unicode_len which only returns the length. --- source/blender/blenkernel/intern/object_dupli.cc | 4 +- source/blender/blenkernel/intern/text.c | 7 +- source/blender/blenlib/BLI_string_utf8.h | 4 +- source/blender/blenlib/intern/string_utf8.c | 158 ++++++++++----------- source/blender/editors/space_console/console_ops.c | 2 +- source/blender/editors/space_text/text_ops.c | 2 +- 6 files changed, 84 insertions(+), 93 deletions(-) diff --git a/source/blender/blenkernel/intern/object_dupli.cc b/source/blender/blenkernel/intern/object_dupli.cc index 141a9a25eca..a46ac4b1175 100644 --- a/source/blender/blenkernel/intern/object_dupli.cc +++ b/source/blender/blenkernel/intern/object_dupli.cc @@ -653,10 +653,10 @@ static Object *find_family_object( return *ob_pt; } - char ch_utf8[7]; + char ch_utf8[BLI_UTF8_MAX + 1]; size_t ch_utf8_len; - ch_utf8_len = BLI_str_utf8_from_unicode(ch, ch_utf8); + ch_utf8_len = BLI_str_utf8_from_unicode(ch, ch_utf8, sizeof(ch_utf8) - 1); ch_utf8[ch_utf8_len] = '\0'; ch_utf8_len += 1; /* Compare with null terminator. 
*/ diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c index bdc82fe626c..6b7b3213a83 100644 --- a/source/blender/blenkernel/intern/text.c +++ b/source/blender/blenkernel/intern/text.c @@ -329,7 +329,8 @@ int txt_extended_ascii_as_utf8(char **str) memcpy(newstr + mi, (*str) + i, bad_char); - BLI_str_utf8_from_unicode((*str)[i + bad_char], newstr + mi + bad_char); + const int mofs = mi + bad_char; + BLI_str_utf8_from_unicode((*str)[i + bad_char], newstr + mofs, (length + added) - mofs); i += bad_char + 1; mi += bad_char + 2; } @@ -2005,7 +2006,7 @@ static bool txt_add_char_intern(Text *text, unsigned int add, bool replace_tabs) txt_delete_sel(text); - add_len = BLI_str_utf8_from_unicode(add, ch); + add_len = BLI_str_utf8_from_unicode(add, ch, sizeof(ch)); tmp = MEM_mallocN(text->curl->len + add_len + 1, "textline_string"); @@ -2061,7 +2062,7 @@ bool txt_replace_char(Text *text, unsigned int add) del = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &del_size); del_size -= text->curc; UNUSED_VARS(del); - add_size = BLI_str_utf8_from_unicode(add, ch); + add_size = BLI_str_utf8_from_unicode(add, ch, sizeof(ch)); if (add_size > del_size) { char *tmp = MEM_mallocN(text->curl->len + add_size - del_size + 1, "textline_string"); diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h index 937b36758f2..3b7463affc0 100644 --- a/source/blender/blenlib/BLI_string_utf8.h +++ b/source/blender/blenlib/BLI_string_utf8.h @@ -48,7 +48,9 @@ unsigned int BLI_str_utf8_as_unicode_step_or_error( const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1, 3); -size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf); +size_t BLI_str_utf8_from_unicode_len(unsigned int c) ATTR_WARN_UNUSED_RESULT; +size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf, const size_t outbuf_len) + ATTR_NONNULL(2); size_t BLI_str_utf8_as_utf32(char32_t 
*__restrict dst_w, const char *__restrict src_c, const size_t maxncpy) ATTR_NONNULL(1, 2); diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c index 222b4df7c0e..3a5e2713b76 100644 --- a/source/blender/blenlib/intern/string_utf8.c +++ b/source/blender/blenlib/intern/string_utf8.c @@ -296,36 +296,19 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst, const wchar_t *__restrict src, const size_t maxncpy) { - const size_t maxlen = maxncpy - 1; - /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */ - const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX; - size_t len = 0; - BLI_assert(maxncpy != 0); - + size_t len = 0; #ifdef DEBUG_STRSIZE memset(dst, 0xff, sizeof(*dst) * maxncpy); #endif - - while (*src && len <= maxlen_secured) { - len += BLI_str_utf8_from_unicode((uint)*src++, dst + len); - } - - /* We have to be more careful for the last six bytes, - * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ - while (*src) { - char t[BLI_UTF8_MAX]; - size_t l = BLI_str_utf8_from_unicode((uint)*src++, t); - BLI_assert(l <= BLI_UTF8_MAX); - if (len + l > maxlen) { - break; - } - memcpy(dst + len, t, l); - len += l; + while (*src && len < maxncpy) { + len += BLI_str_utf8_from_unicode((uint)*src++, dst + len, maxncpy - len); } - dst[len] = '\0'; - + /* Return the correct length when part of the final byte did not fit into the string. 
*/ + while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) { + len--; + } return len; } @@ -335,7 +318,7 @@ size_t BLI_wstrlen_utf8(const wchar_t *src) size_t len = 0; while (*src) { - len += BLI_str_utf8_from_unicode((uint)*src++, NULL); + len += BLI_str_utf8_from_unicode_len((uint)*src++); } return len; @@ -608,56 +591,78 @@ uint BLI_str_utf8_as_unicode_step(const char *__restrict p, } /* was g_unichar_to_utf8 */ + +#define UTF8_VARS_FROM_CHAR32(Char, First, Len) \ + if (Char < 0x80) { \ + First = 0; \ + Len = 1; \ + } \ + else if (Char < 0x800) { \ + First = 0xc0; \ + Len = 2; \ + } \ + else if (Char < 0x10000) { \ + First = 0xe0; \ + Len = 3; \ + } \ + else if (Char < 0x200000) { \ + First = 0xf0; \ + Len = 4; \ + } \ + else if (Char < 0x4000000) { \ + First = 0xf8; \ + Len = 5; \ + } \ + else { \ + First = 0xfc; \ + Len = 6; \ + } \ + (void)0 + +size_t BLI_str_utf8_from_unicode_len(const uint c) +{ + /* If this gets modified, also update the copy in g_string_insert_unichar() */ + uint len = 0; + uint first; + + UTF8_VARS_FROM_CHAR32(c, first, len); + (void)first; + + return len; +} + /** * BLI_str_utf8_from_unicode: + * * \param c: a Unicode character code - * \param outbuf: output buffer, must have at least 6 bytes of space. - * If %NULL, the length will be computed and returned - * and nothing will be written to outbuf. + * \param outbuf: output buffer, must have at least `outbuf_len` bytes of space. + * If the length required by `c` exceeds `outbuf_len`, + * the available bytes will be zeroed and `outbuf_len` returned. * * Converts a single character to UTF-8. * - * \return number of bytes written + * \return number of bytes written. 
*/ -size_t BLI_str_utf8_from_unicode(uint c, char *outbuf) +size_t BLI_str_utf8_from_unicode(uint c, char *outbuf, const size_t outbuf_len) + { /* If this gets modified, also update the copy in g_string_insert_unichar() */ uint len = 0; uint first; - uint i; - if (c < 0x80) { - first = 0; - len = 1; - } - else if (c < 0x800) { - first = 0xc0; - len = 2; - } - else if (c < 0x10000) { - first = 0xe0; - len = 3; - } - else if (c < 0x200000) { - first = 0xf0; - len = 4; - } - else if (c < 0x4000000) { - first = 0xf8; - len = 5; - } - else { - first = 0xfc; - len = 6; + UTF8_VARS_FROM_CHAR32(c, first, len); + + if (UNLIKELY(outbuf_len < len)) { + /* NULL terminate instead of writing a partial byte. */ + memset(outbuf, 0x0, outbuf_len); + return outbuf_len; } - if (outbuf) { - for (i = len - 1; i > 0; i--) { - outbuf[i] = (c & 0x3f) | 0x80; - c >>= 6; - } - outbuf[0] = c | first; + for (uint i = len - 1; i > 0; i--) { + outbuf[i] = (c & 0x3f) | 0x80; + c >>= 6; } + outbuf[0] = c | first; return len; } @@ -701,36 +706,19 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst, const char32_t *__restrict src, const size_t maxncpy) { - const size_t maxlen = maxncpy - 1; - /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */ - const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX; - size_t len = 0; - BLI_assert(maxncpy != 0); - + size_t len = 0; #ifdef DEBUG_STRSIZE memset(dst, 0xff, sizeof(*dst) * maxncpy); #endif - - while (*src && len <= maxlen_secured) { - len += BLI_str_utf8_from_unicode((uint)*src++, dst + len); + while (*src && len < maxncpy) { + len += BLI_str_utf8_from_unicode((uint)*src++, dst + len, maxncpy - len); } - - /* We have to be more careful for the last six bytes, - * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. 
*/ - while (*src) { - char t[BLI_UTF8_MAX]; - size_t l = BLI_str_utf8_from_unicode((uint)*src++, t); - BLI_assert(l <= BLI_UTF8_MAX); - if (len + l > maxlen) { - break; - } - memcpy(dst + len, t, l); - len += l; - } - dst[len] = '\0'; - + /* Return the correct length when part of the final byte did not fit into the string. */ + while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) { + len--; + } return len; } @@ -740,7 +728,7 @@ size_t BLI_str_utf32_as_utf8_len(const char32_t *src) size_t len = 0; while (*src) { - len += BLI_str_utf8_from_unicode((uint)*src++, NULL); + len += BLI_str_utf8_from_unicode_len((uint)*src++); } return len; diff --git a/source/blender/editors/space_console/console_ops.c b/source/blender/editors/space_console/console_ops.c index 763beb8671b..c6fb2560dc0 100644 --- a/source/blender/editors/space_console/console_ops.c +++ b/source/blender/editors/space_console/console_ops.c @@ -435,7 +435,7 @@ static int console_insert_invoke(bContext *C, wmOperator *op, const wmEvent *eve } else { /* in theory, ghost can set value to extended ascii here */ - len = BLI_str_utf8_from_unicode(event->ascii, str); + len = BLI_str_utf8_from_unicode(event->ascii, str, sizeof(str) - 1); } str[len] = '\0'; RNA_string_set(op->ptr, "text", str); diff --git a/source/blender/editors/space_text/text_ops.c b/source/blender/editors/space_text/text_ops.c index b7185766224..c3bc474b98a 100644 --- a/source/blender/editors/space_text/text_ops.c +++ b/source/blender/editors/space_text/text_ops.c @@ -3486,7 +3486,7 @@ static int text_insert_invoke(bContext *C, wmOperator *op, const wmEvent *event) } else { /* in theory, ghost can set value to extended ascii here */ - len = BLI_str_utf8_from_unicode(event->ascii, str); + len = BLI_str_utf8_from_unicode(event->ascii, str, sizeof(str) - 1); } str[len] = '\0'; RNA_string_set(op->ptr, "text", str); -- cgit v1.2.3