From 311bfe18ce8563d0b1b571e9dc9eb35b97de3790 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 10 Jul 2023 17:12:10 -0400 Subject: ref-filter: clear reachable list pointers after freeing In `reach_filter()`, we pop all commits from the reachable lists, leaving them empty. But because we're operating on a list pointer that was passed by value, the original `filter.reachable_from` and `filter.unreachable_from` pointers are left dangling. As is the case with the previous commit, nobody touches either of these fields after calling `reach_filter()`, so leaving them dangling is OK. But this future proofs against dangerous situations. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index 4991cd4f7a..048d277cbf 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2418,13 +2418,13 @@ void ref_array_clear(struct ref_array *array) #define EXCLUDE_REACHED 0 #define INCLUDE_REACHED 1 static void reach_filter(struct ref_array *array, - struct commit_list *check_reachable, + struct commit_list **check_reachable, int include_reached) { int i, old_nr; struct commit **to_clear; - if (!check_reachable) + if (!*check_reachable) return; CALLOC_ARRAY(to_clear, array->nr); @@ -2434,7 +2434,7 @@ static void reach_filter(struct ref_array *array, } tips_reachable_from_bases(the_repository, - check_reachable, + *check_reachable, to_clear, array->nr, UNINTERESTING); @@ -2455,8 +2455,8 @@ static void reach_filter(struct ref_array *array, clear_commit_marks_many(old_nr, to_clear, ALL_REV_FLAGS); - while (check_reachable) { - struct commit *merge_commit = pop_commit(&check_reachable); + while (*check_reachable) { + struct commit *merge_commit = pop_commit(check_reachable); clear_commit_marks(merge_commit, ALL_REV_FLAGS); } @@ -2553,8 +2553,8 @@ int filter_refs(struct ref_array *array, struct ref_filter *filter, unsigned int clear_contains_cache(&ref_cbdata.no_contains_cache); /* Filters that need revision walking */ - reach_filter(array, filter->reachable_from, INCLUDE_REACHED); - reach_filter(array, filter->unreachable_from, EXCLUDE_REACHED); + reach_filter(array, &filter->reachable_from, INCLUDE_REACHED); + reach_filter(array, &filter->unreachable_from, EXCLUDE_REACHED); save_commit_buffer = save_commit_buffer_orig; return ret; -- cgit v1.2.3 From b571fb98008b485bfc6f7d6538b79a7e92d731f4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 10 Jul 2023 17:12:13 -0400 Subject: ref-filter: add `ref_filter_clear()` We did not bother to clean up at all in `git branch` or `git tag`, and `git for-each-ref` only cleans up a couple of members. Add and call `ref_filter_clear()` when cleaning up a `struct ref_filter`. Running this patch (without any test changes) indicates a couple of now leak-free tests. This was found by running: $ make SANITIZE=leak $ make -C t GIT_TEST_PASSING_SANITIZE_LEAK=check GIT_TEST_OPTS=--immediate (Note that the `reachable_from` and `unreachable_from` lists should be cleaned as they are used. So this is just covering any case where we might bail before running the reachability check.) Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index 048d277cbf..d32f426898 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2866,3 +2866,19 @@ int parse_opt_merge_filter(const struct option *opt, const char *arg, int unset) return 0; } + +void ref_filter_init(struct ref_filter *filter) +{ + struct ref_filter blank = REF_FILTER_INIT; + memcpy(filter, &blank, sizeof(blank)); +} + +void ref_filter_clear(struct ref_filter *filter) +{ + oid_array_clear(&filter->points_at); + free_commit_list(filter->with_commit); + free_commit_list(filter->no_commit); + free_commit_list(filter->reachable_from); + free_commit_list(filter->unreachable_from); + ref_filter_init(filter); +} -- cgit v1.2.3 From 284c55deb53921457929ca2afe231c48e6cada85 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 10 Jul 2023 17:12:16 -0400 Subject: ref-filter.c: parameterize match functions over patterns `match_pattern()` and `match_name_as_path()` both take a `struct ref_filter *`, and then store a stack variable `patterns` pointing at `filter->patterns`. The subsequent patch will add a new array of patterns to match over (the excluded patterns, via a new `git for-each-ref --exclude` option), treating the return value of these functions differently depending on which patterns are being used to match. Tweak `match_pattern()` and `match_name_as_path()` to take an array of patterns to prepare for passing either in. Once we start passing either in, `match_pattern()` will have little to do with a particular `struct ref_filter *` instance. To clarify this, drop it from the argument list, and replace it with the only bit of the `ref_filter` that we care about (`filter->ignore_case`). Co-authored-by: Taylor Blau Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index d32f426898..91acf53ef9 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2104,12 +2104,12 @@ static int get_ref_atom_value(struct ref_array_item *ref, int atom, * matches a pattern "refs/heads/mas") or a wildcard (e.g. the same ref * matches "refs/heads/mas*", too). */ -static int match_pattern(const struct ref_filter *filter, const char *refname) +static int match_pattern(const char **patterns, const char *refname, + int ignore_case) { - const char **patterns = filter->name_patterns; unsigned flags = 0; - if (filter->ignore_case) + if (ignore_case) flags |= WM_CASEFOLD; /* @@ -2134,13 +2134,13 @@ static int match_pattern(const struct ref_filter *filter, const char *refname) * matches a pattern "refs/heads/" but not "refs/heads/m") or a * wildcard (e.g. the same ref matches "refs/heads/m*", too). */ -static int match_name_as_path(const struct ref_filter *filter, const char *refname) +static int match_name_as_path(const char **pattern, const char *refname, + int ignore_case) { - const char **pattern = filter->name_patterns; int namelen = strlen(refname); unsigned flags = WM_PATHNAME; - if (filter->ignore_case) + if (ignore_case) flags |= WM_CASEFOLD; for (; *pattern; pattern++) { @@ -2165,8 +2165,10 @@ static int filter_pattern_match(struct ref_filter *filter, const char *refname) if (!*filter->name_patterns) return 1; /* No pattern always matches */ if (filter->match_as_path) - return match_name_as_path(filter, refname); - return match_pattern(filter, refname); + return match_name_as_path(filter->name_patterns, refname, + filter->ignore_case); + return match_pattern(filter->name_patterns, refname, + filter->ignore_case); } /* -- cgit v1.2.3 From 8255dd8a3dce094129690d17595dc822171d1e61 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 10 Jul 2023 17:12:19 -0400 Subject: builtin/for-each-ref.c: add `--exclude` option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using `for-each-ref`, it is sometimes convenient for the caller to be able to exclude certain parts of the references. For example, if there are many `refs/__hidden__/*` references, the caller may want to emit all references *except* the hidden ones. Currently, the only way to do this is to post-process the output, like: $ git for-each-ref --format='%(refname)' | grep -v '^refs/hidden/' Which is do-able, but requires processing a potentially large quantity of references. Teach `git for-each-ref` a new `--exclude=` option, which excludes references from the results if they match one or more excluded patterns. This patch provides a naive implementation where the `ref_filter` still sees all references (including ones that it will discard) and is left to check whether each reference matches any excluded pattern(s) before emitting them. By culling out references we know the caller doesn't care about, we can avoid allocating memory for their storage, as well as spending time sorting the output (among other things). Even the naive implementation provides a significant speed-up on a modified copy of linux.git (that has a hidden ref pointing at each commit): $ hyperfine \ 'git.compile for-each-ref --format="%(objectname) %(refname)" | grep -vE "[0-9a-f]{40} refs/pull/"' \ 'git.compile for-each-ref --format="%(objectname) %(refname)" --exclude refs/pull/' Benchmark 1: git.compile for-each-ref --format="%(objectname) %(refname)" | grep -vE "[0-9a-f]{40} refs/pull/" Time (mean ± σ): 820.1 ms ± 2.0 ms [User: 703.7 ms, System: 152.0 ms] Range (min … max): 817.7 ms … 823.3 ms 10 runs Benchmark 2: git.compile for-each-ref --format="%(objectname) %(refname)" --exclude refs/pull/ Time (mean ± σ): 106.6 ms ± 1.1 ms [User: 99.4 ms, System: 7.1 ms] Range (min … max): 104.7 ms … 109.1 ms 27 runs Summary 'git.compile for-each-ref --format="%(objectname) %(refname)" --exclude refs/pull/' ran 7.69 ± 0.08 times faster than 'git.compile for-each-ref --format="%(objectname) %(refname)" | grep -vE "[0-9a-f]{40} refs/pull/"' Subsequent patches will improve on this by avoiding visiting excluded sections of the `packed-refs` file in certain cases. Co-authored-by: Jeff King Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index 91acf53ef9..ec9b79c918 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2171,6 +2171,16 @@ static int filter_pattern_match(struct ref_filter *filter, const char *refname) filter->ignore_case); } +static int filter_exclude_match(struct ref_filter *filter, const char *refname) +{ + if (!filter->exclude.nr) + return 0; + if (filter->match_as_path) + return match_name_as_path(filter->exclude.v, refname, + filter->ignore_case); + return match_pattern(filter->exclude.v, refname, filter->ignore_case); +} + /* * This is the same as for_each_fullref_in(), but it tries to iterate * only over the patterns we'll care about. Note that it _doesn't_ do a full @@ -2338,6 +2348,9 @@ static int ref_filter_handler(const char *refname, const struct object_id *oid, if (!filter_pattern_match(filter, refname)) return 0; + if (filter_exclude_match(filter, refname)) + return 0; + if (filter->points_at.nr && !match_points_at(&filter->points_at, oid, refname)) return 0; @@ -2877,6 +2890,7 @@ void ref_filter_init(struct ref_filter *filter) void ref_filter_clear(struct ref_filter *filter) { + strvec_clear(&filter->exclude); oid_array_clear(&filter->points_at); free_commit_list(filter->with_commit); free_commit_list(filter->no_commit); -- cgit v1.2.3 From b269ac53c07aa46d5a88d05dac8216d189e69a50 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 10 Jul 2023 17:12:22 -0400 Subject: refs: plumb `exclude_patterns` argument throughout The subsequent patch will want to access an optional `excluded_patterns` array within `refs/packed-backend.c` that will cull out certain references matching any of the given patterns on a best-effort basis. To do so, the refs subsystem needs to be updated to pass this value across a number of different locations. Prepare for a future patch by introducing this plumbing now, passing NULLs at top-level APIs in order to make that patch less noisy and more easily readable. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index ec9b79c918..c72d016bcc 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2215,7 +2215,7 @@ static int for_each_fullref_in_pattern(struct ref_filter *filter, return refs_for_each_fullref_in_prefixes(get_main_ref_store(the_repository), NULL, filter->name_patterns, - cb, cb_data); + NULL, cb, cb_data); } /* -- cgit v1.2.3 From 59c35fac54054b3f781b0275eac7d7c54468f0d5 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 10 Jul 2023 17:12:28 -0400 Subject: refs/packed-backend.c: implement jump lists to avoid excluded pattern(s) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When iterating through the `packed-refs` file in order to answer a query like: $ git for-each-ref --exclude=refs/__hidden__ it would be useful to avoid walking over all of the entries in `refs/__hidden__/*` when possible, since we know that the ref-filter code is going to throw them away anyways. In certain circumstances, doing so is possible. The algorithm for doing so is as follows: - For each excluded pattern, find the first record that matches it, and the first record that *doesn't* match it (i.e. the location you'd next want to consider when excluding that pattern). - Sort the set of excluded regions from the previous step in ascending order of the first location within the `packed-refs` file that matches. - Clean up the results from the previous step: discard empty regions, and combine adjacent regions. The set of regions which remains is referred to as the "jump list", and never contains any references which should be included in the result set. Then when iterating through the `packed-refs` file, if `iter->pos` is ever contained in one of the regions from the previous steps, advance `iter->pos` past the end of that region, and continue enumeration. Note that we only perform this optimization when none of the excluded pattern(s) have special meta-characters in them. For a pattern like "refs/foo[ac]", the excluded regions ("refs/fooa", "refs/fooc", and everything underneath them) are not connected. A future implementation that handles this case may split the character class (pretending as if two patterns were excluded: "refs/fooa", and "refs/fooc"). There are a few other gotchas worth considering. First, note that the jump list is sorted, so once we jump past a region, we can avoid considering it (or any regions preceding it) again. The member `jump_pos` is used to track the first next-possible region to jump through. Second, note that the jump list is best-effort, since we do not handle loose references, and because of the meta-character issue above. The jump list may not skip past all references which won't appear in the results, but will never skip over a reference which does appear in the result set. In repositories with a large number of hidden references, the speed-up can be significant. Tests here are done with a copy of linux.git with a reference "refs/pull/N" pointing at every commit, as in: $ git rev-list HEAD | awk '{ print "create refs/pull/" NR " " $0 }' | git update-ref --stdin $ git pack-refs --all , it is significantly faster to have `for-each-ref` jump over the excluded references, as opposed to filtering them out after the fact: $ hyperfine \ 'git for-each-ref --format="%(objectname) %(refname)" | grep -vE "^[0-9a-f]{40} refs/pull/"' \ 'git.prev for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull"' \ 'git.compile for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull"' Benchmark 1: git for-each-ref --format="%(objectname) %(refname)" | grep -vE "^[0-9a-f]{40} refs/pull/" Time (mean ± σ): 798.1 ms ± 3.3 ms [User: 687.6 ms, System: 146.4 ms] Range (min … max): 794.5 ms … 805.5 ms 10 runs Benchmark 2: git.prev for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull" Time (mean ± σ): 98.9 ms ± 1.4 ms [User: 93.1 ms, System: 5.7 ms] Range (min … max): 97.0 ms … 104.0 ms 29 runs Benchmark 3: git.compile for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull" Time (mean ± σ): 4.5 ms ± 0.2 ms [User: 0.7 ms, System: 3.8 ms] Range (min … max): 4.1 ms … 5.8 ms 524 runs Summary 'git.compile for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull"' ran 21.87 ± 1.05 times faster than 'git.prev for-each-ref --format="%(objectname) %(refname)" --exclude="refs/pull"' 176.52 ± 8.19 times faster than 'git for-each-ref --format="%(objectname) %(refname)" | grep -vE "^[0-9a-f]{40} refs/pull/"' (Comparing stock git and this patch isn't quite fair, since an earlier commit in this series adds a naive implementation of the `--exclude` option. `git.prev` is built from the previous commit and includes this naive implementation). Using the jump list is fairly straightforward (see the changes to `refs/packed-backend.c::next_record()`), but constructing the list is not. To ensure that the construction is correct, add a new suite of tests in t1419 covering various corner cases (overlapping regions, partially overlapping regions, adjacent regions, etc.). Co-authored-by: Jeff King Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ref-filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index c72d016bcc..28c1eb06be 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2210,12 +2210,14 @@ static int for_each_fullref_in_pattern(struct ref_filter *filter, if (!filter->name_patterns[0]) { /* no patterns; we have to look at everything */ - return for_each_fullref_in("", cb, cb_data); + return refs_for_each_fullref_in(get_main_ref_store(the_repository), + "", filter->exclude.v, cb, cb_data); } return refs_for_each_fullref_in_prefixes(get_main_ref_store(the_repository), NULL, filter->name_patterns, - NULL, cb, cb_data); + filter->exclude.v, + cb, cb_data); } /* -- cgit v1.2.3