From 38630711a02e553f209ace9a8627a7a851820a2d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Wed, 25 Aug 2021 15:19:00 +1000 Subject: BLI_string_utf8: remove unnecessary utf8 decoding functions Remove BLI_str_utf8_as_unicode_and_size and BLI_str_utf8_as_unicode_and_size_safe. Use BLI_str_utf8_as_unicode_step instead since it takes a buffer bounds argument to prevent buffer over-reading. --- source/blender/blenkernel/intern/text.c | 15 ++++-- source/blender/blenlib/BLI_string_utf8.h | 4 -- source/blender/blenlib/intern/string_cursor_utf8.c | 20 +++++--- source/blender/blenlib/intern/string_search.cc | 17 +++---- source/blender/blenlib/intern/string_utf8.c | 53 ++++++---------------- .../blender/editors/space_text/text_autocomplete.c | 7 ++- source/blender/makesrna/intern/rna_wm.c | 5 +- 7 files changed, 51 insertions(+), 70 deletions(-) diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c index c2ab91251b6..7f1f6590e48 100644 --- a/source/blender/blenkernel/intern/text.c +++ b/source/blender/blenkernel/intern/text.c @@ -1888,8 +1888,9 @@ void txt_delete_char(Text *text) } } else { /* Just deleting a char */ - size_t c_len = 0; - c = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &c_len); + size_t c_len = text->curc; + c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len); + c_len -= text->curc; UNUSED_VARS(c); memmove(text->curl->line + text->curc, @@ -1937,9 +1938,11 @@ void txt_backspace_char(Text *text) txt_pop_sel(text); } else { /* Just backspacing a char */ - size_t c_len = 0; const char *prev = BLI_str_prev_char_utf8(text->curl->line + text->curc); - c = BLI_str_utf8_as_unicode_and_size(prev, &c_len); + size_t c_len = prev - text->curl->line; + c = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &c_len); + c_len -= prev - text->curl->line; + UNUSED_VARS(c); /* source and destination overlap, don't use memcpy() */ @@ -2053,7 +2056,9 @@ bool txt_replace_char(Text *text, unsigned int add) return txt_add_char(text, add); } - del = BLI_str_utf8_as_unicode_and_size(text->curl->line + text->curc, &del_size); + del_size = text->curc; + del = BLI_str_utf8_as_unicode_step(text->curl->line, text->curl->len, &del_size); + del_size -= text->curc; UNUSED_VARS(del); add_size = BLI_str_utf8_from_unicode(add, ch); diff --git a/source/blender/blenlib/BLI_string_utf8.h b/source/blender/blenlib/BLI_string_utf8.h index 1b12147fe0f..b97c8748ca4 100644 --- a/source/blender/blenlib/BLI_string_utf8.h +++ b/source/blender/blenlib/BLI_string_utf8.h @@ -39,10 +39,6 @@ int BLI_str_utf8_size(const char *p) ATTR_NONNULL(); int BLI_str_utf8_size_safe(const char *p) ATTR_NONNULL(); /* copied from glib */ unsigned int BLI_str_utf8_as_unicode(const char *p) ATTR_NONNULL(); -unsigned int BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index) - ATTR_NONNULL(); -unsigned int BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, - size_t *__restrict index) ATTR_NONNULL(); unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_NONNULL(1, 3); diff --git a/source/blender/blenlib/intern/string_cursor_utf8.c b/source/blender/blenlib/intern/string_cursor_utf8.c index 90fde02b11f..59b9f4eeca0 100644 --- a/source/blender/blenlib/intern/string_cursor_utf8.c +++ b/source/blender/blenlib/intern/string_cursor_utf8.c @@ -101,11 +101,14 @@ static eStrCursorDelimType cursor_delim_type_unicode(const uint uch) return STRCUR_DELIM_ALPHANUMERIC; /* Not quite true, but ok for now */ } -static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8) +static eStrCursorDelimType cursor_delim_type_utf8(const char *ch_utf8, + const size_t ch_utf8_len, + const int pos) { /* for full unicode support we really need to have large lookup tables to figure * out what's what in every possible char set - and python, glib both have these. */ - uint uch = BLI_str_utf8_as_unicode(ch_utf8); + size_t index = (size_t)pos; + uint uch = BLI_str_utf8_as_unicode_step_or_error(ch_utf8, ch_utf8_len, &index); return cursor_delim_type_unicode(uch); } @@ -157,14 +160,16 @@ void BLI_str_cursor_step_utf8(const char *str, } if (jump != STRCUR_JUMP_NONE) { - const eStrCursorDelimType delim_type = (*pos) < maxlen ? cursor_delim_type_utf8(&str[*pos]) : - STRCUR_DELIM_NONE; + const eStrCursorDelimType delim_type = (*pos) < maxlen ? + cursor_delim_type_utf8(str, maxlen, *pos) : + STRCUR_DELIM_NONE; /* jump between special characters (/,\,_,-, etc.), * look at function cursor_delim_type() for complete * list of special character, ctr -> */ while ((*pos) < maxlen) { if (BLI_str_cursor_step_next_utf8(str, maxlen, pos)) { - if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) { + if ((jump != STRCUR_JUMP_ALL) && + (delim_type != cursor_delim_type_utf8(str, maxlen, *pos))) { break; } } @@ -184,7 +189,7 @@ void BLI_str_cursor_step_utf8(const char *str, if (jump != STRCUR_JUMP_NONE) { const eStrCursorDelimType delim_type = (*pos) > 0 ? - cursor_delim_type_utf8(&str[(*pos) - 1]) : + cursor_delim_type_utf8(str, maxlen, *pos - 1) : STRCUR_DELIM_NONE; /* jump between special characters (/,\,_,-, etc.), * look at function cursor_delim_type() for complete @@ -192,7 +197,8 @@ void BLI_str_cursor_step_utf8(const char *str, while ((*pos) > 0) { const int pos_prev = *pos; if (BLI_str_cursor_step_prev_utf8(str, maxlen, pos)) { - if ((jump != STRCUR_JUMP_ALL) && (delim_type != cursor_delim_type_utf8(&str[*pos]))) { + if ((jump != STRCUR_JUMP_ALL) && + (delim_type != cursor_delim_type_utf8(str, maxlen, (size_t)*pos))) { /* left only: compensate for index/change in direction */ if ((pos_orig - (*pos)) >= 1) { *pos = pos_prev; diff --git a/source/blender/blenlib/intern/string_search.cc b/source/blender/blenlib/intern/string_search.cc index 25a13674932..a466c124073 100644 --- a/source/blender/blenlib/intern/string_search.cc +++ b/source/blender/blenlib/intern/string_search.cc @@ -71,12 +71,12 @@ int damerau_levenshtein_distance(StringRef a, StringRef b) for (const int i : IndexRange(size_a)) { v2[0] = (i + 1) * deletion_cost; - const uint32_t unicode_a = BLI_str_utf8_as_unicode_and_size(a.data() + offset_a, &offset_a); + const uint32_t unicode_a = BLI_str_utf8_as_unicode_step(a.data(), a.size(), &offset_a); uint32_t prev_unicode_b; size_t offset_b = 0; for (const int j : IndexRange(size_b)) { - const uint32_t unicode_b = BLI_str_utf8_as_unicode_and_size(b.data() + offset_b, &offset_b); + const uint32_t unicode_b = BLI_str_utf8_as_unicode_step(b.data(), b.size(), &offset_b); /* Check how costly the different operations would be and pick the cheapest - the one with * minimal cost. */ @@ -202,8 +202,8 @@ static bool match_word_initials(StringRef query, int first_found_word_index = -1; while (query_index < query.size()) { - const uint query_unicode = BLI_str_utf8_as_unicode_and_size(query.data() + query_index, - &query_index); + const uint query_unicode = BLI_str_utf8_as_unicode_step( + query.data(), query.size(), &query_index); while (true) { /* We are at the end of words, no complete match has been found yet. */ if (word_index >= words.size()) { @@ -226,8 +226,8 @@ static bool match_word_initials(StringRef query, StringRef word = words[word_index]; /* Try to match the current character with the current word. */ if (static_cast(char_index) < word.size()) { - const uint32_t char_unicode = BLI_str_utf8_as_unicode_and_size(word.data() + char_index, - &char_index); + const uint32_t char_unicode = BLI_str_utf8_as_unicode_step( + word.data(), word.size(), &char_index); if (query_unicode == char_unicode) { r_word_is_matched[word_index] = true; if (first_found_word_index == -1) { @@ -368,8 +368,9 @@ void extract_normalized_words(StringRef str, size_t word_start = 0; size_t offset = 0; while (offset < str_size_in_bytes) { - size_t size = 0; - uint32_t unicode = BLI_str_utf8_as_unicode_and_size(str.data() + offset, &size); + size_t size = offset; + uint32_t unicode = BLI_str_utf8_as_unicode_step(str.data(), str.size(), &size); + size -= offset; if (is_separator(unicode)) { if (is_in_word) { r_words.append( diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c index 06fd3168c24..7a01077bb44 100644 --- a/source/blender/blenlib/intern/string_utf8.c +++ b/source/blender/blenlib/intern/string_utf8.c @@ -546,40 +546,6 @@ uint BLI_str_utf8_as_unicode(const char *p) return result; } -/* variant that increments the length */ -uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index) -{ - int i, len; - uint mask = 0; - uint result; - const unsigned char c = (unsigned char)*p; - - UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { - return BLI_UTF8_ERR; - } - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); - *index += (size_t)len; - return result; -} - -uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index) -{ - int i, len; - uint mask = 0; - uint result; - const unsigned char c = (unsigned char)*p; - - UTF8_COMPUTE(c, mask, len, -1); - if (UNLIKELY(len == -1)) { - *index += 1; - return c; - } - UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR); - *index += (size_t)len; - return result; -} - /** * UTF8 decoding that steps over the index (unless an error is encountered). * @@ -709,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w, memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy); #endif + const size_t src_c_len = strlen(src_c); + const char *src_c_end = src_c + src_c_len; + size_t index = 0; while (*src_c && len != maxlen) { - size_t step = 0; - uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step); + const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index); if (unicode != BLI_UTF8_ERR) { *dst_w = unicode; - src_c += step; } else { *dst_w = '?'; - src_c = BLI_str_find_next_char_utf8(src_c, NULL); + const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end); + if (src_c_next != NULL) { + index = (size_t)(src_c_next - src_c); + } + else { + index += 1; + } } dst_w++; len++; @@ -898,7 +871,9 @@ size_t BLI_str_partition_ex_utf8(const char *str, index = 0; *sep >= str && (!end || *sep < end) && **sep != '\0'; *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) { - const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index); + size_t index_ofs = 0; + const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs); + index += index_ofs; if (c == BLI_UTF8_ERR) { *suf = *sep = NULL; diff --git a/source/blender/editors/space_text/text_autocomplete.c b/source/blender/editors/space_text/text_autocomplete.c index a38ed12e53b..32e455f3ac9 100644 --- a/source/blender/editors/space_text/text_autocomplete.c +++ b/source/blender/editors/space_text/text_autocomplete.c @@ -177,13 +177,12 @@ static GHash *text_autocomplete_build(Text *text) i_pos = i_start; while ((i_start < linep->len) && (!text_check_identifier_nodigit_unicode( - BLI_str_utf8_as_unicode_and_size_safe(&linep->line[i_start], &i_pos)))) { + BLI_str_utf8_as_unicode_step(linep->line, linep->len, &i_pos)))) { i_start = i_pos; } i_pos = i_end = i_start; - while ((i_end < linep->len) && - (text_check_identifier_unicode( - BLI_str_utf8_as_unicode_and_size_safe(&linep->line[i_end], &i_pos)))) { + while ((i_end < linep->len) && (text_check_identifier_unicode(BLI_str_utf8_as_unicode_step( + linep->line, linep->len, &i_pos)))) { i_end = i_pos; } diff --git a/source/blender/makesrna/intern/rna_wm.c b/source/blender/makesrna/intern/rna_wm.c index b910648495b..21a3c087197 100644 --- a/source/blender/makesrna/intern/rna_wm.c +++ b/source/blender/makesrna/intern/rna_wm.c @@ -649,10 +649,9 @@ static void rna_Event_unicode_get(PointerRNA *ptr, char *value) size_t len = 0; if (event->utf8_buf[0]) { - BLI_str_utf8_as_unicode_and_size(event->utf8_buf, &len); - if (len > 0) { + if (BLI_str_utf8_as_unicode_step_or_error(event->utf8_buf, sizeof(event->utf8_buf), &len) != + BLI_UTF8_ERR) memcpy(value, event->utf8_buf, len); - } } value[len] = '\0'; -- cgit v1.2.3