From 52acddf36c8cb3778ab2098a0d95cc2e375a4069 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 24 Apr 2023 18:20:10 -0400 Subject: string-list: multi-delimiter `string_list_split_in_place()` Enhance `string_list_split_in_place()` to accept multiple characters as delimiters instead of a single character. Instead of using `strchr(3)` to locate the first occurrence of the given delimiter character, `string_list_split_in_place_multi()` uses `strcspn(3)` to move past the initial segment of characters that are not in the delimiting set. When only a single delimiting character is provided, `strpbrk(3)` (which is implemented with `strcspn(3)`) has equivalent performance to `strchr(3)`. Modern `strcspn(3)` implementations treat an empty delimiter or the singleton delimiter as a special case and fall back to calling strchrnul(). Both glibc[1] and musl[2] implement `strcspn(3)` this way. This change is one step toward removing `strtok(3)` from the tree. Note that `string_list_split_in_place()` is not a strict replacement for `strtok()`, since it will happily turn sequential delimiter characters into empty entries in the resulting string_list. For example: string_list_split_in_place(&xs, "foo:;:bar:;:baz", ":;", -1) would yield a string list of: ["foo", "", "", "bar", "", "", "baz"] Callers that wish to emulate the behavior of strtok(3) more directly should call `string_list_remove_empty_items()` after splitting. To avoid regressions for the new multi-character delimiter cases, update t0063 in this patch as well. 
[1]: https://sourceware.org/git/?p=glibc.git;a=blob;f=string/strcspn.c;hb=glibc-2.37#l35 [2]: https://git.musl-libc.org/cgit/musl/tree/src/string/strcspn.c?h=v1.2.3#n11 Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-string-list.c | 4 ++-- t/t0063-string-list.sh | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 't') diff --git a/t/helper/test-string-list.c b/t/helper/test-string-list.c index 2123dda85b..63df88575c 100644 --- a/t/helper/test-string-list.c +++ b/t/helper/test-string-list.c @@ -62,7 +62,7 @@ int cmd__string_list(int argc, const char **argv) struct string_list list = STRING_LIST_INIT_NODUP; int i; char *s = xstrdup(argv[2]); - int delim = *argv[3]; + const char *delim = argv[3]; int maxsplit = atoi(argv[4]); i = string_list_split_in_place(&list, s, delim, maxsplit); @@ -111,7 +111,7 @@ int cmd__string_list(int argc, const char **argv) */ if (sb.len && sb.buf[sb.len - 1] == '\n') strbuf_setlen(&sb, sb.len - 1); - string_list_split_in_place(&list, sb.buf, '\n', -1); + string_list_split_in_place(&list, sb.buf, "\n", -1); string_list_sort(&list); diff --git a/t/t0063-string-list.sh b/t/t0063-string-list.sh index 46d4839194..1fee6d9010 100755 --- a/t/t0063-string-list.sh +++ b/t/t0063-string-list.sh @@ -18,6 +18,14 @@ test_split () { " } +test_split_in_place() { + cat >expected && + test_expect_success "split (in place) $1 at $2, max $3" " + test-tool string-list split_in_place '$1' '$2' '$3' >actual && + test_cmp expected actual + " +} + test_split "foo:bar:baz" ":" "-1" <