Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'source/blender/blenlib/intern/string_utf8.c')
-rw-r--r--source/blender/blenlib/intern/string_utf8.c140
1 files changed, 54 insertions, 86 deletions
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index dbde5221d7e..e35e2bcca3c 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -297,8 +297,8 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
const size_t maxncpy)
{
const size_t maxlen = maxncpy - 1;
- /* 6 is max utf8 length of an unicode char. */
- const int64_t maxlen_secured = (int64_t)maxlen - 6;
+ /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */
+ const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX;
size_t len = 0;
BLI_assert(maxncpy != 0);
@@ -314,9 +314,9 @@ size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
/* We have to be more careful for the last six bytes,
* to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
while (*src) {
- char t[6];
+ char t[BLI_UTF8_MAX];
size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
- BLI_assert(l <= 6);
+ BLI_assert(l <= BLI_UTF8_MAX);
if (len + l > maxlen) {
break;
}
@@ -546,104 +546,63 @@ uint BLI_str_utf8_as_unicode(const char *p)
return result;
}
-/* variant that increments the length */
-uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
+/**
+ * UTF8 decoding that steps over the index (unless an error is encountered).
+ *
+ * \param p: The text to step over.
+ * \param p_len: The length of `p`.
+ * \param index: Index of `p` to step over.
+ * \return the code-point or #BLI_UTF8_ERR if there is a decoding error.
+ *
+ * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
+ * must have the same behavior is encountering a nil byte,
+ * so functions that only use the first part of a string has matching behavior to functions
+ * that null terminate the text.
+ */
+uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p,
+ const size_t p_len,
+ size_t *__restrict index)
{
int i, len;
uint mask = 0;
uint result;
- const unsigned char c = (unsigned char)*p;
+ const unsigned char c = (unsigned char)*(p += *index);
+
+ BLI_assert(*index < p_len);
+ BLI_assert(c != '\0');
UTF8_COMPUTE(c, mask, len, -1);
- if (UNLIKELY(len == -1)) {
+ if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) {
return BLI_UTF8_ERR;
}
UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
- *index += (size_t)len;
- return result;
-}
-
-uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
-{
- int i, len;
- uint mask = 0;
- uint result;
- const unsigned char c = (unsigned char)*p;
-
- UTF8_COMPUTE(c, mask, len, -1);
- if (UNLIKELY(len == -1)) {
- *index += 1;
- return c;
+ if (UNLIKELY(result == BLI_UTF8_ERR)) {
+ return BLI_UTF8_ERR;
}
- UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
*index += (size_t)len;
+ BLI_assert(*index <= p_len);
return result;
}
/**
- * Another variant that steps over the index.
+ * UTF8 decoding that steps over the index (unless an error is encountered).
*
* \param p: The text to step over.
* \param p_len: The length of `p`.
* \param index: Index of `p` to step over.
+ * \return the code-point `(p + *index)` if there is a decoding error.
*
- * \note currently this also falls back to latin1 for text drawing.
- *
- * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
- * must have the same behavior is encountering a nil byte,
- * so functions that only use the first part of a string has matching behavior to functions
- * that null terminate the text.
+ * \note Falls back to `LATIN1` for text drawing.
*/
uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
const size_t p_len,
size_t *__restrict index)
{
- int i, len;
- uint mask = 0;
- uint result;
- const char c = p[*index];
-
- BLI_assert(*index < p_len);
- BLI_assert(c != '\0');
-
- UTF8_COMPUTE(c, mask, len, -1);
- if (UNLIKELY(len == -1)) {
- const char *p_next = BLI_str_find_next_char_utf8(p + *index, p + p_len);
- /* #BLI_str_find_next_char_utf8 ensures the nil byte will terminate.
- * so there is no chance this sets the index past the nil byte (assert this is the case). */
- BLI_assert(p_next || (memchr(p + *index, '\0', p_len - *index) == NULL));
- len = (int)((p_next ? (size_t)(p_next - p) : p_len) - *index);
- result = BLI_UTF8_ERR;
- }
- else if (UNLIKELY(*index + (size_t)len > p_len)) {
- /* A multi-byte character reads past the buffer bounds,
- * match the behavior of encountering an byte with invalid encoding below. */
- len = 1;
- result = (uint)c;
- }
- else {
- /* This is tricky since there are a few ways we can bail out of bad unicode
- * values, 3 possible solutions. */
- p += *index;
-#if 0
- UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
-#elif 1
- /* WARNING: this is NOT part of glib, or supported by similar functions.
- * this is added for text drawing because some filepaths can have latin1
- * characters */
- UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
- if (result == BLI_UTF8_ERR) {
- len = 1;
- result = (uint)c;
- }
- /* end warning! */
-#else
- /* Without a fallback like '?', text drawing will stop on this value. */
- UTF8_GET(result, p, i, mask, len, '?');
-#endif
+ uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index);
+ if (UNLIKELY(result == BLI_UTF8_ERR)) {
+ result = (uint)p[*index];
+ *index += 1;
}
-
- *index += (size_t)len;
BLI_assert(*index <= p_len);
return result;
}
@@ -716,16 +675,23 @@ size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
#endif
- while (*src_c && len != maxlen) {
- size_t step = 0;
- uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
+ const size_t src_c_len = strlen(src_c);
+ const char *src_c_end = src_c + src_c_len;
+ size_t index = 0;
+ while ((index < src_c_len) && (len != maxlen)) {
+ const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index);
if (unicode != BLI_UTF8_ERR) {
*dst_w = unicode;
- src_c += step;
}
else {
*dst_w = '?';
- src_c = BLI_str_find_next_char_utf8(src_c, NULL);
+ const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
+ if (src_c_next != NULL) {
+ index = (size_t)(src_c_next - src_c);
+ }
+ else {
+ index += 1;
+ }
}
dst_w++;
len++;
@@ -741,8 +707,8 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
const size_t maxncpy)
{
const size_t maxlen = maxncpy - 1;
- /* 6 is max utf8 length of an unicode char. */
- const int64_t maxlen_secured = (int64_t)maxlen - 6;
+ /* #BLI_UTF8_MAX is max utf8 length of an unicode char. */
+ const int64_t maxlen_secured = (int64_t)maxlen - BLI_UTF8_MAX;
size_t len = 0;
BLI_assert(maxncpy != 0);
@@ -758,9 +724,9 @@ size_t BLI_str_utf32_as_utf8(char *__restrict dst,
/* We have to be more careful for the last six bytes,
* to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
while (*src) {
- char t[6];
+ char t[BLI_UTF8_MAX];
size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
- BLI_assert(l <= 6);
+ BLI_assert(l <= BLI_UTF8_MAX);
if (len + l > maxlen) {
break;
}
@@ -905,7 +871,9 @@ size_t BLI_str_partition_ex_utf8(const char *str,
index = 0;
*sep >= str && (!end || *sep < end) && **sep != '\0';
*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
- const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index);
+ size_t index_ofs = 0;
+ const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
+ index += index_ofs;
if (c == BLI_UTF8_ERR) {
*suf = *sep = NULL;