From 49abcd21da65f12b4ae873433e5f6220bfaae1da Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 20 Mar 2023 11:26:54 +0000 Subject: for-each-ref: add ahead-behind format atom The previous change implemented the ahead_behind() method, including an algorithm to compute the ahead/behind values for a number of commit tips relative to a number of commit bases. Now, integrate that algorithm as part of 'git for-each-ref' hidden behind a new format atom, ahead-behind. This naturally extends to 'git branch' and 'git tag' builtins, as well. This format allows specifying multiple bases, if so desired, and all matching references are compared against all of those bases. For this reason, failing to read a reference provided from these atoms results in an error. In order to translate the ahead_behind() method information to the format output code in ref-filter.c, we must populate arrays of ahead_behind_count structs. In struct ref_array, we store the full array that will be passed to ahead_behind(). In struct ref_array_item, we store an array of pointers that point to the relvant items within the full array. In this way, we can pull all relevant ahead/behind values directly when formatting output for a specific item. It also ensures the lifetime of the ahead_behind_count structs matches the time that the array is being used. Add specific tests of the ahead/behind counts in t6600-test-reach.sh, as it has an interesting repository shape. In particular, its merging strategy and its use of different commit-graphs would demonstrate over- counting if the ahead_behind() method did not already account for that possibility. Also add tests for the specific for-each-ref, branch, and tag builtins. In the case of 'git tag', there are intersting cases that happen when some of the selected tips are not commits. This requires careful logic around commits_nr in the second loop of filter_ahead_behind(). Also, the test in t7004 is carefully located to avoid being dependent on the GPG prereq. It also avoids using the test_commit helper, as that will add ticks to the time and disrupt the expected timestamps in later tag tests. Also add performance tests in a new p1300-graph-walks.sh script. This will be useful for more uses in the future, but for now compare the ahead-behind counting algorithm in 'git for-each-ref' to the naive implementation by running 'git rev-list --count' processes for each input. For the Git source code repository, the improvement is already obvious: Test this tree --------------------------------------------------------------- 1500.2: ahead-behind counts: git for-each-ref 0.07(0.07+0.00) 1500.3: ahead-behind counts: git branch 0.07(0.06+0.00) 1500.4: ahead-behind counts: git tag 0.07(0.06+0.00) 1500.5: ahead-behind counts: git rev-list 1.32(1.04+0.27) But the standard performance benchmark is the Linux kernel repository, which demosntrates a significant improvement: Test this tree --------------------------------------------------------------- 1500.2: ahead-behind counts: git for-each-ref 0.27(0.24+0.02) 1500.3: ahead-behind counts: git branch 0.27(0.24+0.03) 1500.4: ahead-behind counts: git tag 0.28(0.27+0.01) 1500.5: ahead-behind counts: git rev-list 4.57(4.03+0.54) The 'git rev-list' test exists in this change as a demonstration, but it will be removed in the next change to avoid wasting time on this comparison. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- ref-filter.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index f8203c6b05..62135f649e 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -158,6 +158,7 @@ enum atom_type { ATOM_THEN, ATOM_ELSE, ATOM_REST, + ATOM_AHEADBEHIND, }; /* @@ -586,6 +587,22 @@ static int rest_atom_parser(struct ref_format *format, struct used_atom *atom, return 0; } +static int ahead_behind_atom_parser(struct ref_format *format, struct used_atom *atom, + const char *arg, struct strbuf *err) +{ + struct string_list_item *item; + + if (!arg) + return strbuf_addf_ret(err, -1, _("expected format: %%(ahead-behind:)")); + + item = string_list_append(&format->bases, arg); + item->util = lookup_commit_reference_by_name(arg); + if (!item->util) + die("failed to find '%s'", arg); + + return 0; +} + static int head_atom_parser(struct ref_format *format, struct used_atom *atom, const char *arg, struct strbuf *err) { @@ -645,6 +662,7 @@ static struct { [ATOM_THEN] = { "then", SOURCE_NONE }, [ATOM_ELSE] = { "else", SOURCE_NONE }, [ATOM_REST] = { "rest", SOURCE_NONE, FIELD_STR, rest_atom_parser }, + [ATOM_AHEADBEHIND] = { "ahead-behind", SOURCE_OTHER, FIELD_STR, ahead_behind_atom_parser }, /* * Please update $__git_ref_fieldlist in git-completion.bash * when you add new atoms @@ -1848,6 +1866,7 @@ static int populate_value(struct ref_array_item *ref, struct strbuf *err) struct object *obj; int i; struct object_info empty = OBJECT_INFO_INIT; + int ahead_behind_atoms = 0; CALLOC_ARRAY(ref->value, used_atom_cnt); @@ -1978,6 +1997,16 @@ static int populate_value(struct ref_array_item *ref, struct strbuf *err) else v->s = xstrdup(""); continue; + } else if (atom_type == ATOM_AHEADBEHIND) { + if (ref->counts) { + const struct ahead_behind_count *count; + count = ref->counts[ahead_behind_atoms++]; + v->s = xstrfmt("%d %d", count->ahead, count->behind); + } else { + /* Not a commit. */ + v->s = xstrdup(""); + } + continue; } else continue; @@ -2328,6 +2357,7 @@ static void free_array_item(struct ref_array_item *item) free((char *)item->value[i].s); free(item->value); } + free(item->counts); free(item); } @@ -2356,6 +2386,8 @@ void ref_array_clear(struct ref_array *array) free_worktrees(ref_to_worktree_map.worktrees); ref_to_worktree_map.worktrees = NULL; } + + FREE_AND_NULL(array->counts); } #define EXCLUDE_REACHED 0 @@ -2418,6 +2450,47 @@ static void reach_filter(struct ref_array *array, free(to_clear); } +void filter_ahead_behind(struct repository *r, + struct ref_format *format, + struct ref_array *array) +{ + struct commit **commits; + size_t commits_nr = format->bases.nr + array->nr; + + if (!format->bases.nr || !array->nr) + return; + + ALLOC_ARRAY(commits, commits_nr); + for (size_t i = 0; i < format->bases.nr; i++) + commits[i] = format->bases.items[i].util; + + ALLOC_ARRAY(array->counts, st_mult(format->bases.nr, array->nr)); + + commits_nr = format->bases.nr; + array->counts_nr = 0; + for (size_t i = 0; i < array->nr; i++) { + const char *name = array->items[i]->refname; + commits[commits_nr] = lookup_commit_reference_by_name(name); + + if (!commits[commits_nr]) + continue; + + CALLOC_ARRAY(array->items[i]->counts, format->bases.nr); + for (size_t j = 0; j < format->bases.nr; j++) { + struct ahead_behind_count *count; + count = &array->counts[array->counts_nr++]; + count->tip_index = commits_nr; + count->base_index = j; + + array->items[i]->counts[j] = count; + } + commits_nr++; + } + + ahead_behind(r, commits, commits_nr, array->counts, array->counts_nr); + free(commits); +} + /* * API for filtering a set of refs. Based on the type of refs the user * has requested, we iterate through those refs and apply filters -- cgit v1.2.3 From cbfe360b140fe92d9c4a763bf630c3b8ba431522 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 20 Mar 2023 11:26:55 +0000 Subject: commit-reach: add tips_reachable_from_bases() Both 'git for-each-ref --merged=' and 'git branch --merged=' use the ref-filter machinery to select references or branches (respectively) that are reachable from a set of commits presented by one or more --merged arguments. This happens within reach_filter(), which uses the revision-walk machinery to walk history in a standard way. However, the commit-reach.c file is full of custom searches that are more efficient, especially for reachability queries that can terminate early when reachability is discovered. Add a new tips_reachable_from_bases() method to commit-reach.c and call it from within reach_filter() in ref-filter.c. This affects both 'git branch' and 'git for-each-ref' as tested in p1500-graph-walks.sh. For the Linux kernel repository, we take an already-fast algorithm and make it even faster: Test HEAD~1 HEAD ------------------------------------------------------------------- 1500.5: contains: git for-each-ref --merged 0.13 0.02 -84.6% 1500.6: contains: git branch --merged 0.14 0.02 -85.7% 1500.7: contains: git tag --merged 0.15 0.03 -80.0% (Note that we remove the iterative 'git rev-list' test from p1500 because it no longer makes sense as a comparison to 'git for-each-ref' and would just waste time running it for these comparisons.) The algorithm is implemented in commit-reach.c in the method tips_reachable_from_base(). This method takes a string_list of tips and assigns the 'util' for each item with the value 1 if the base commit can reach those tips. Like other reachability queries in commit-reach.c, the fastest way to search for "can A reach B?" is to do a depth-first search up to the generation number of B, preferring to explore first parents before later parents. While we must walk all reachable commits up to that generation number when the answer is "no", the depth-first search can answer "yes" much faster than other approaches in most cases. This search becomes trickier when there are multiple targets for the depth-first search. The commits with lower generation number are more likely to be within the history of the start commit, but we don't want to waste time searching commits of low generation number if the commit target with lowest generation number has already been found. The trick here is to take the input commits and sort them by generation number in ascending order. Track the index within this order as min_generation_index. When we find a commit, if its index in the list is equal to min_generation_index, then we can increase the generation number boundary of our search to the next-lowest value in the list. With this mechanism, the number of commits to search is minimized with respect to the depth-first search heuristic. We will walk all commits up to the minimum generation number of a commit that is _not_ reachable from the start, but we will walk only the necessary portion of the depth-first search for the reachable commits of lower generation. Add extra tests for this behavior in t6600-test-reach.sh as the interesting data shape of that repository can sometimes demonstrate corner case bugs. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- ref-filter.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'ref-filter.c') diff --git a/ref-filter.c b/ref-filter.c index 62135f649e..c724ff9411 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -2396,33 +2396,22 @@ static void reach_filter(struct ref_array *array, struct commit_list *check_reachable, int include_reached) { - struct rev_info revs; int i, old_nr; struct commit **to_clear; - struct commit_list *cr; if (!check_reachable) return; CALLOC_ARRAY(to_clear, array->nr); - - repo_init_revisions(the_repository, &revs, NULL); - for (i = 0; i < array->nr; i++) { struct ref_array_item *item = array->items[i]; - add_pending_object(&revs, &item->commit->object, item->refname); to_clear[i] = item->commit; } - for (cr = check_reachable; cr; cr = cr->next) { - struct commit *merge_commit = cr->item; - merge_commit->object.flags |= UNINTERESTING; - add_pending_object(&revs, &merge_commit->object, ""); - } - - revs.limited = 1; - if (prepare_revision_walk(&revs)) - die(_("revision walk setup failed")); + tips_reachable_from_bases(the_repository, + check_reachable, + to_clear, array->nr, + UNINTERESTING); old_nr = array->nr; array->nr = 0; @@ -2446,7 +2435,6 @@ static void reach_filter(struct ref_array *array, clear_commit_marks(merge_commit, ALL_REV_FLAGS); } - release_revisions(&revs); free(to_clear); } -- cgit v1.2.3