source/blender/blenlib/BLI_string_utf8.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

/* SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

/** \file
 * \ingroup bli
 */

#include "BLI_compiler_attrs.h"
#include "BLI_sys_types.h"

#ifdef __cplusplus
extern "C" {
#endif

char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t maxncpy)
    ATTR_NONNULL(1, 2);
size_t BLI_strncpy_utf8_rlen(char *__restrict dst,
                             const char *__restrict src,
                             size_t maxncpy) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1, 2);
/**
 * Find first UTF-8 invalid byte in given \a str, of \a length bytes.
 *
 * \return the offset of the first invalid byte.
 */
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t length) ATTR_NONNULL(1);
/**
 * Remove any invalid UTF-8 byte (taking into account multi-bytes sequence of course).
 *
 * \return number of stripped bytes.
 */
int BLI_str_utf8_invalid_strip(char *str, size_t length) ATTR_NONNULL(1);

/**
 * \return The size (in bytes) of a single UTF-8 char.
 * \warning Can return -1 on bad chars.
 */
int BLI_str_utf8_size(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
/**
 * Use when we want to skip errors.
 */
int BLI_str_utf8_size_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
/**
 * \param p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If \a p does not point to a valid UTF-8 encoded character, results are
 * undefined. If you are not sure that the bytes are complete
 * valid Unicode characters, you should use g_utf8_get_char_validated()
 * instead.
 *
 * Return value: the resulting character
 */
unsigned int BLI_str_utf8_as_unicode(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
/**
 * UTF8 decoding that steps over the index (unless an error is encountered).
 *
 * \param p: The text to step over.
 * \param p_len: The length of `p`.
 * \param index: Index of `p` to step over.
 * \return the code-point `(p + *index)` if there is a decoding error.
 *
 * \note Falls back to `LATIN1` for text drawing.
 */
unsigned int BLI_str_utf8_as_unicode_step(const char *__restrict p,
                                          size_t p_len,
                                          size_t *__restrict index) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1, 3);
/**
 * UTF8 decoding that steps over the index (unless an error is encountered).
 *
 * \param p: The text to step over.
 * \param p_len: The length of `p`.
 * \param index: Index of `p` to step over.
 * \return the code-point or #BLI_UTF8_ERR if there is a decoding error.
 *
 * \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
 * must have the same behavior is encountering a nil byte,
 * so functions that only use the first part of a string has matching behavior to functions
 * that null terminate the text.
 */
unsigned int BLI_str_utf8_as_unicode_step_or_error(
    const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1, 3);

size_t BLI_str_utf8_from_unicode_len(unsigned int c) ATTR_WARN_UNUSED_RESULT;
/**
 * BLI_str_utf8_from_unicode:
 *
 * \param c: a Unicode character code
 * \param outbuf: output buffer, must have at least `outbuf_len` bytes of space.
 * If the length required by `c` exceeds `outbuf_len`,
 * the bytes available bytes will be zeroed and `outbuf_len` returned.
 *
 * Converts a single character to UTF-8.
 *
 * \return number of bytes written.
 */
size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf, size_t outbuf_len) ATTR_NONNULL(2);
size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
                             const char *__restrict src_c,
                             size_t maxncpy) ATTR_NONNULL(1, 2);
size_t BLI_str_utf32_as_utf8(char *__restrict dst, const char32_t *__restrict src, size_t maxncpy)
    ATTR_NONNULL(1, 2);
/**
 * \return The UTF-32 len in UTF-8.
 */
size_t BLI_str_utf32_as_utf8_len(const char32_t *src) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);

/**
 * BLI_str_find_prev_char_utf8:
 * \param p: pointer to some position within \a str
 * \param str_start: pointer to the beginning of a UTF-8 encoded string
 *
 * Given a position \a p with a UTF-8 encoded string \a str, find the start
 * of the previous UTF-8 character starting before. \a p Returns \a str_start if no
 * UTF-8 characters are present in \a str_start before \a p.
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * \return A pointer to the found character.
 */
const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);
/**
 * \param p: a pointer to a position within a UTF-8 encoded string
 * \param str_end: a pointer to the byte following the end of the string.
 *
 * Finds the start of the next UTF-8 character in the string after \a p
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * \return a pointer to the found character or a pointer to the null terminating character '\0'.
 */
const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
    ATTR_WARN_UNUSED_RESULT ATTR_RETURNS_NONNULL ATTR_NONNULL(1, 2);

/**
 * \return the `wchar_t` length in UTF-8.
 */
size_t BLI_wstrlen_utf8(const wchar_t *src) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT;
size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
    ATTR_NONNULL(1, 2) ATTR_WARN_UNUSED_RESULT;
size_t BLI_strlen_utf8(const char *strc) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT;
size_t BLI_strnlen_utf8_ex(const char *strc, size_t maxlen, size_t *r_len_bytes)
    ATTR_NONNULL(1, 3);
/**
 * \param strc: the string to measure the length.
 * \param maxlen: the string length (in bytes)
 * \return the unicode length (not in bytes!)
 */
size_t BLI_strnlen_utf8(const char *strc, size_t maxlen) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT;
size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
                                 const wchar_t *__restrict src,
                                 size_t maxncpy) ATTR_NONNULL(1, 2);
size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst,
                                   const char *__restrict src,
                                   size_t maxncpy) ATTR_NONNULL(1, 2);

/**
 * Count columns that character/string occupies (based on `wcwidth.co`).
 */
int BLI_wcwidth(char32_t ucs) ATTR_WARN_UNUSED_RESULT;
int BLI_wcswidth(const char32_t *pwcs, size_t n) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
/**
 * \warning can return -1 on bad chars.
 */
int BLI_str_utf8_char_width(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);
int BLI_str_utf8_char_width_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1);

size_t BLI_str_partition_utf8(const char *str,
                              const unsigned int delim[],
                              const char **sep,
                              const char **suf) ATTR_NONNULL(1, 2, 3, 4);
size_t BLI_str_rpartition_utf8(const char *str,
                               const unsigned int delim[],
                               const char **sep,
                               const char **suf) ATTR_NONNULL(1, 2, 3, 4);
size_t BLI_str_partition_ex_utf8(const char *str,
                                 const char *end,
                                 const unsigned int delim[],
                                 const char **sep,
                                 const char **suf,
                                 bool from_right) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1, 3, 4, 5);

int BLI_str_utf8_offset_to_index(const char *str, int offset) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1);
int BLI_str_utf8_offset_from_index(const char *str, int index) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1);
int BLI_str_utf8_offset_to_column(const char *str, int offset) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1);
int BLI_str_utf8_offset_from_column(const char *str, int column) ATTR_WARN_UNUSED_RESULT
    ATTR_NONNULL(1);

/** Size in bytes. */
#define BLI_UTF8_MAX 6
#define BLI_UTF8_WIDTH_MAX 2 /* columns */
#define BLI_UTF8_ERR ((unsigned int)-1)

/* -------------------------------------------------------------------- */
/** \name String Copy/Format Macros
 * Avoid repeating destination with `sizeof(..)`.
 * \note `ARRAY_SIZE` allows pointers on some platforms.
 * \{ */

#define STRNCPY_UTF8(dst, src) BLI_strncpy_utf8(dst, src, ARRAY_SIZE(dst))
#define STRNCPY_UTF8_RLEN(dst, src) BLI_strncpy_utf8_rlen(dst, src, ARRAY_SIZE(dst))

/** \} */

#ifdef __cplusplus
}
#endif