From 522cc87fdc25449222a5894a428eebf4b8d5eaa9 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Dec 2022 15:46:53 +0100 Subject: utf8: fix truncated string lengths in `utf8_strnwidth()` The `utf8_strnwidth()` function accepts an optional string length as input parameter. This parameter can either be set to `-1`, in which case we call `strlen()` on the input. Or it can be set to a positive integer that indicates a precomputed length, which callers typically compute by calling `strlen()` at some point themselves. The input parameter is an `int` though, whereas `strlen()` returns a `size_t`. This can lead to implementation-defined behaviour though when the `size_t` cannot be represented by the `int`. In the general case though this leads to wrap-around and thus to negative string sizes, which is sure enough to not lead to well-defined behaviour. Fix this by accepting a `size_t` instead of an `int` as string length. While this takes away the ability of callers to simply pass in `-1` as string length, it really is trivial enough to convert them to instead pass in `strlen()` instead. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- utf8.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index 5b39361ada..504e517c34 100644 --- a/utf8.c +++ b/utf8.c @@ -206,13 +206,11 @@ int utf8_width(const char **start, size_t *remainder_p) * string, assuming that the string is utf8. Returns strlen() instead * if the string does not look like a valid utf8 string. */ -int utf8_strnwidth(const char *string, int len, int skip_ansi) +int utf8_strnwidth(const char *string, size_t len, int skip_ansi) { int width = 0; const char *orig = string; - if (len == -1) - len = strlen(string); while (string && string < orig + len) { int skip; while (skip_ansi && @@ -225,7 +223,7 @@ int utf8_strnwidth(const char *string, int len, int skip_ansi) int utf8_strwidth(const char *string) { - return utf8_strnwidth(string, -1, 0); + return utf8_strnwidth(string, strlen(string), 0); } int is_utf8(const char *text) @@ -791,7 +789,7 @@ int skip_utf8_bom(char **text, size_t len) void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width, const char *s) { - int slen = strlen(s); + size_t slen = strlen(s); int display_len = utf8_strnwidth(s, slen, 0); int utf8_compensation = slen - display_len; -- cgit v1.2.3 From 17d23e8a3812a5ca3dd6564e74d5250f22e5d76d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Dec 2022 15:47:00 +0100 Subject: utf8: fix returning negative string width The `utf8_strnwidth()` function calls `utf8_width()` in a loop and adds its returned width to the end result. `utf8_width()` can return `-1` though in case it reads a control character, which means that the computed string width is going to be wrong. In the worst case where there are more control characters than non-control characters, we may even return a negative string width. Fix this bug by treating control characters as having zero width. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- utf8.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index 504e517c34..6a21fd6a7b 100644 --- a/utf8.c +++ b/utf8.c @@ -212,11 +212,15 @@ int utf8_strnwidth(const char *string, size_t len, int skip_ansi) const char *orig = string; while (string && string < orig + len) { - int skip; + int glyph_width, skip; + while (skip_ansi && (skip = display_mode_esc_sequence_len(string)) != 0) string += skip; - width += utf8_width(&string, NULL); + + glyph_width = utf8_width(&string, NULL); + if (glyph_width > 0) + width += glyph_width; } return string ? width : len; } -- cgit v1.2.3 From 937b71cc8b5b998963a7f9a33312ba3549d55510 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Dec 2022 15:47:04 +0100 Subject: utf8: fix overflow when returning string width The return type of both `utf8_strwidth()` and `utf8_strnwidth()` is `int`, but we operate on string lengths which are typically of type `size_t`. This means that when the string is longer than `INT_MAX`, we will overflow and thus return a negative result. This can lead to an out-of-bounds write with `--pretty=format:%<1)%B` and a commit message that is 2^31+1 bytes long: ================================================================= ==26009==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x603000001168 at pc 0x7f95c4e5f427 bp 0x7ffd8541c900 sp 0x7ffd8541c0a8 WRITE of size 2147483649 at 0x603000001168 thread T0 #0 0x7f95c4e5f426 in __interceptor_memcpy /usr/src/debug/gcc/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:827 #1 0x5612bbb1068c in format_and_pad_commit pretty.c:1763 #2 0x5612bbb1087a in format_commit_item pretty.c:1801 #3 0x5612bbc33bab in strbuf_expand strbuf.c:429 #4 0x5612bbb110e7 in repo_format_commit_message pretty.c:1869 #5 0x5612bbb12d96 in pretty_print_commit pretty.c:2161 #6 0x5612bba0a4d5 in show_log log-tree.c:781 #7 0x5612bba0d6c7 in log_tree_commit log-tree.c:1117 #8 0x5612bb691ed5 in cmd_log_walk_no_free builtin/log.c:508 #9 0x5612bb69235b in cmd_log_walk builtin/log.c:549 #10 0x5612bb6951a2 in cmd_log builtin/log.c:883 #11 0x5612bb56c993 in run_builtin git.c:466 #12 0x5612bb56d397 in handle_builtin git.c:721 #13 0x5612bb56db07 in run_argv git.c:788 #14 0x5612bb56e8a7 in cmd_main git.c:923 #15 0x5612bb803682 in main common-main.c:57 #16 0x7f95c4c3c28f (/usr/lib/libc.so.6+0x2328f) #17 0x7f95c4c3c349 in __libc_start_main (/usr/lib/libc.so.6+0x23349) #18 0x5612bb5680e4 in _start ../sysdeps/x86_64/start.S:115 0x603000001168 is located 0 bytes to the right of 24-byte region [0x603000001150,0x603000001168) allocated by thread T0 here: #0 0x7f95c4ebe7ea in __interceptor_realloc /usr/src/debug/gcc/libsanitizer/asan/asan_malloc_linux.cpp:85 #1 0x5612bbcdd556 in xrealloc wrapper.c:136 #2 0x5612bbc310a3 in strbuf_grow strbuf.c:99 #3 0x5612bbc32acd in strbuf_add strbuf.c:298 #4 0x5612bbc33aec in strbuf_expand strbuf.c:418 #5 0x5612bbb110e7 in repo_format_commit_message pretty.c:1869 #6 0x5612bbb12d96 in pretty_print_commit pretty.c:2161 #7 0x5612bba0a4d5 in show_log log-tree.c:781 #8 0x5612bba0d6c7 in log_tree_commit log-tree.c:1117 #9 0x5612bb691ed5 in cmd_log_walk_no_free builtin/log.c:508 #10 0x5612bb69235b in cmd_log_walk builtin/log.c:549 #11 0x5612bb6951a2 in cmd_log builtin/log.c:883 #12 0x5612bb56c993 in run_builtin git.c:466 #13 0x5612bb56d397 in handle_builtin git.c:721 #14 0x5612bb56db07 in run_argv git.c:788 #15 0x5612bb56e8a7 in cmd_main git.c:923 #16 0x5612bb803682 in main common-main.c:57 #17 0x7f95c4c3c28f (/usr/lib/libc.so.6+0x2328f) SUMMARY: AddressSanitizer: heap-buffer-overflow /usr/src/debug/gcc/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:827 in __interceptor_memcpy Shadow bytes around the buggy address: 0x0c067fff81d0: fd fd fd fa fa fa fd fd fd fa fa fa fd fd fd fa 0x0c067fff81e0: fa fa fd fd fd fd fa fa fd fd fd fd fa fa fd fd 0x0c067fff81f0: fd fa fa fa fd fd fd fa fa fa fd fd fd fa fa fa 0x0c067fff8200: fd fd fd fa fa fa fd fd fd fd fa fa 00 00 00 fa 0x0c067fff8210: fa fa fd fd fd fa fa fa fd fd fd fa fa fa fd fd =>0x0c067fff8220: fd fa fa fa fd fd fd fa fa fa 00 00 00[fa]fa fa 0x0c067fff8230: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 0x0c067fff8240: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 0x0c067fff8250: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 0x0c067fff8260: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 0x0c067fff8270: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa Shadow byte legend (one shadow byte represents 8 application bytes): Addressable: 00 Partially addressable: 01 02 03 04 05 06 07 Heap left redzone: fa Freed heap region: fd Stack left redzone: f1 Stack mid redzone: f2 Stack right redzone: f3 Stack after return: f5 Stack use after scope: f8 Global redzone: f9 Global init order: f6 Poisoned by user: f7 Container overflow: fc Array cookie: ac Intra object redzone: bb ASan internal: fe Left alloca redzone: ca Right alloca redzone: cb ==26009==ABORTING Now the proper fix for this would be to convert both functions to return an `size_t` instead of an `int`. But given that this commit may be part of a security release, let's instead do the minimal viable fix and die in case we see an overflow. Add a test that would have previously caused us to crash. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- utf8.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index 6a21fd6a7b..30c7787cfa 100644 --- a/utf8.c +++ b/utf8.c @@ -208,11 +208,12 @@ int utf8_width(const char **start, size_t *remainder_p) */ int utf8_strnwidth(const char *string, size_t len, int skip_ansi) { - int width = 0; const char *orig = string; + size_t width = 0; while (string && string < orig + len) { - int glyph_width, skip; + int glyph_width; + size_t skip; while (skip_ansi && (skip = display_mode_esc_sequence_len(string)) != 0) @@ -222,7 +223,12 @@ int utf8_strnwidth(const char *string, size_t len, int skip_ansi) if (glyph_width > 0) width += glyph_width; } - return string ? width : len; + + /* + * TODO: fix the interface of this function and `utf8_strwidth()` to + * return `size_t` instead of `int`. + */ + return cast_size_t_to_int(string ? width : len); } int utf8_strwidth(const char *string) -- cgit v1.2.3 From 81c2d4c3a5ba0e6ab8c348708441fed170e63a82 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Dec 2022 15:47:10 +0100 Subject: utf8: fix checking for glyph width in `strbuf_utf8_replace()` In `strbuf_utf8_replace()`, we call `utf8_width()` to compute the width of the current glyph. If the glyph is a control character though it can be that `utf8_width()` returns `-1`, but because we assign this value to a `size_t` the conversion will cause us to underflow. This bug can easily be triggered with the following command: $ git log --pretty='format:xxx%<|(1,trunc)%x10' >From all I can see though this seems to be a benign underflow that has no security-related consequences. Fix the bug by using an `int` instead. When we see a control character, we now copy it into the target buffer but don't advance the current width of the string. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- utf8.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index 30c7787cfa..077daf4b20 100644 --- a/utf8.c +++ b/utf8.c @@ -377,6 +377,7 @@ void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width, dst = sb_dst.buf; while (src < end) { + int glyph_width; char *old; size_t n; @@ -390,21 +391,29 @@ void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width, break; old = src; - n = utf8_width((const char**)&src, NULL); - if (!src) /* broken utf-8, do nothing */ + glyph_width = utf8_width((const char**)&src, NULL); + if (!src) /* broken utf-8, do nothing */ goto out; - if (n && w >= pos && w < pos + width) { + + /* + * In case we see a control character we copy it into the + * buffer, but don't add it to the width. + */ + if (glyph_width < 0) + glyph_width = 0; + + if (glyph_width && w >= pos && w < pos + width) { if (subst) { memcpy(dst, subst, subst_len); dst += subst_len; subst = NULL; } - w += n; + w += glyph_width; continue; } memcpy(dst, old, src - old); dst += src - old; - w += n; + w += glyph_width; } strbuf_setlen(&sb_dst, dst - sb_dst.buf); strbuf_swap(sb_src, &sb_dst); -- cgit v1.2.3 From f930a2394303b902e2973f4308f96529f736b8bc Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 1 Dec 2022 15:47:15 +0100 Subject: utf8: refactor `strbuf_utf8_replace` to not rely on preallocated buffer In `strbuf_utf8_replace`, we preallocate the destination buffer and then use `memcpy` to copy bytes into it at computed offsets. This feels rather fragile and is hard to understand at times. Refactor the code to instead use `strbuf_add` and `strbuf_addstr` so that we can be sure that there is no possibility to perform an out-of-bounds write. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- utf8.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index 077daf4b20..d8a16af87c 100644 --- a/utf8.c +++ b/utf8.c @@ -365,26 +365,20 @@ void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len, void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width, const char *subst) { - struct strbuf sb_dst = STRBUF_INIT; - char *src = sb_src->buf; - char *end = src + sb_src->len; - char *dst; - int w = 0, subst_len = 0; + const char *src = sb_src->buf, *end = sb_src->buf + sb_src->len; + struct strbuf dst; + int w = 0; - if (subst) - subst_len = strlen(subst); - strbuf_grow(&sb_dst, sb_src->len + subst_len); - dst = sb_dst.buf; + strbuf_init(&dst, sb_src->len); while (src < end) { + const char *old; int glyph_width; - char *old; size_t n; while ((n = display_mode_esc_sequence_len(src))) { - memcpy(dst, src, n); + strbuf_add(&dst, src, n); src += n; - dst += n; } if (src >= end) @@ -404,21 +398,19 @@ void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width, if (glyph_width && w >= pos && w < pos + width) { if (subst) { - memcpy(dst, subst, subst_len); - dst += subst_len; + strbuf_addstr(&dst, subst); subst = NULL; } - w += glyph_width; - continue; + } else { + strbuf_add(&dst, old, src - old); } - memcpy(dst, old, src - old); - dst += src - old; + w += glyph_width; } - strbuf_setlen(&sb_dst, dst - sb_dst.buf); - strbuf_swap(sb_src, &sb_dst); + + strbuf_swap(sb_src, &dst); out: - strbuf_release(&sb_dst); + strbuf_release(&dst); } /* -- cgit v1.2.3