From a96015a517060e5b69c6dd428f7276f1078ba507 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:45 -0500 Subject: pack-bitmap: plug leak in find_objects() The `find_objects()` function creates an object_list for any tips of the reachability query which do not have corresponding bitmaps. The object_list is not used outside of `find_objects()`, but we never free it with `object_list_free()`, resulting in a leak. Let's plug that leak by calling `object_list_free()`, which results in t6113 becoming leak-free. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t6113-rev-list-bitmap-filters.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 't') diff --git a/t/t6113-rev-list-bitmap-filters.sh b/t/t6113-rev-list-bitmap-filters.sh index 86c70521f1..459f0d7412 100755 --- a/t/t6113-rev-list-bitmap-filters.sh +++ b/t/t6113-rev-list-bitmap-filters.sh @@ -4,6 +4,8 @@ test_description='rev-list combining bitmaps and filters' . ./test-lib.sh . "$TEST_DIRECTORY"/lib-bitmap.sh +TEST_PASSES_SANITIZE_LEAK=true + test_expect_success 'set up bitmapped repo' ' # one commit will have bitmaps, the other will not test_commit one && -- cgit v1.2.3 From 5f5ccd959573f88e126d53df16b149c64e6e9091 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:51 -0500 Subject: midx: implement `BTMP` chunk When a multi-pack bitmap is used to implement verbatim pack reuse (that is, when verbatim chunks from an on-disk packfile are copied directly[^1]), it does so by using its "preferred pack" as the source for pack-reuse. This allows repositories to pack the majority of their objects into a single (often large) pack, and then use it as the single source for verbatim pack reuse. This increases the amount of objects that are reused verbatim (and consequently, decrease the amount of time it takes to generate many packs). But this performance comes at a cost, which is that the preferred packfile must pace its growth with that of the entire repository in order to maintain the utility of verbatim pack reuse. As repositories grow beyond what we can reasonably store in a single packfile, the utility of verbatim pack reuse diminishes. Or, at the very least, it becomes increasingly more expensive to maintain as the pack grows larger and larger. It would be beneficial to be able to perform this same optimization over multiple packs, provided some modest constraints (most importantly, that the set of packs eligible for verbatim reuse are disjoint with respect to the subset of their objects being sent). If we assume that the packs which we treat as candidates for verbatim reuse are disjoint with respect to any of their objects we may output, we need to make only modest modifications to the verbatim pack-reuse code itself. Most notably, we need to remove the assumption that the bits in the reachability bitmap corresponding to objects from the single reuse pack begin at the first bit position. Future patches will unwind these assumptions and reimplement their existing functionality as special cases of the more general assumptions (e.g. that reuse bits can start anywhere within the bitset, but happen to start at 0 for all existing cases). This patch does not yet relax any of those assumptions. Instead, it implements a foundational data-structure, the "Bitampped Packs" (`BTMP`) chunk of the multi-pack index. The `BTMP` chunk's contents are described in detail here. Importantly, the `BTMP` chunk contains information to map regions of a multi-pack index's reachability bitmap to the packs whose objects they represent. For now, this chunk is only written, not read (outside of the test-tool used in this patch to test the new chunk's behavior). Future patches will begin to make use of this new chunk. [^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the pack that wasn't used verbatim. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-read-midx.c | 30 +++++++++++++++++++++++++++++- t/t5319-multi-pack-index.sh | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) (limited to 't') diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c index e9a444ddba..e48557aba1 100644 --- a/t/helper/test-read-midx.c +++ b/t/helper/test-read-midx.c @@ -100,10 +100,36 @@ static int read_midx_preferred_pack(const char *object_dir) return 0; } +static int read_midx_bitmapped_packs(const char *object_dir) +{ + struct multi_pack_index *midx = NULL; + struct bitmapped_pack pack; + uint32_t i; + + setup_git_directory(); + + midx = load_multi_pack_index(object_dir, 1); + if (!midx) + return 1; + + for (i = 0; i < midx->num_packs; i++) { + if (nth_bitmapped_pack(the_repository, midx, &pack, i) < 0) + return 1; + + printf("%s\n", pack_basename(pack.p)); + printf(" bitmap_pos: %"PRIuMAX"\n", (uintmax_t)pack.bitmap_pos); + printf(" bitmap_nr: %"PRIuMAX"\n", (uintmax_t)pack.bitmap_nr); + } + + close_midx(midx); + + return 0; +} + int cmd__read_midx(int argc, const char **argv) { if (!(argc == 2 || argc == 3)) - usage("read-midx [--show-objects|--checksum|--preferred-pack] "); + usage("read-midx [--show-objects|--checksum|--preferred-pack|--bitmap] "); if (!strcmp(argv[1], "--show-objects")) return read_midx_file(argv[2], 1); @@ -111,5 +137,7 @@ int cmd__read_midx(int argc, const char **argv) return read_midx_checksum(argv[2]); else if (!strcmp(argv[1], "--preferred-pack")) return read_midx_preferred_pack(argv[2]); + else if (!strcmp(argv[1], "--bitmap")) + return read_midx_bitmapped_packs(argv[2]); return read_midx_file(argv[1], 0); } diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index c20aafe99a..dd09134db0 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -1171,4 +1171,39 @@ test_expect_success 'reader notices out-of-bounds fanout' ' test_cmp expect err ' +test_expect_success 'bitmapped packs are stored via the BTMP chunk' ' + test_when_finished "rm -fr repo" && + git init repo && + ( + cd repo && + + for i in 1 2 3 4 5 + do + test_commit "$i" && + git repack -d || return 1 + done && + + find $objdir/pack -type f -name "*.idx" | xargs -n 1 basename | + sort >packs && + + git multi-pack-index write --stdin-packs err && + cat >expect <<-\EOF && + error: MIDX does not contain the BTMP chunk + EOF + test_cmp expect err && + + git multi-pack-index write --stdin-packs --bitmap \ + --preferred-pack="$(head -n1 actual && + for i in $(test_seq $(wc -l expect && + test_cmp expect actual + ) +' + test_done -- cgit v1.2.3 From b1e3333068247ddd44021a0b69457c249ddee7a1 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:24:25 -0500 Subject: midx: implement `midx_preferred_pack()` When performing a binary search over the objects in a MIDX's bitmap (i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack ordering using a combination of (a) the preferred pack, (b) the pack's lexical position in the MIDX based on pack names, and (c) the object offset within the pack. In order to perform this binary search, the reader must know the identity of the preferred pack. This could be stored in the MIDX, but isn't for historical reasons, mostly because it can easily be inferred at read-time by looking at the object in the first bit position and finding out which pack it was selected from in the MIDX, like so: nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0)); In midx_to_pack_pos() which performs this binary search, we look up the identity of the preferred pack before each search. This is relatively quick, since it involves two table-driven lookups (one in the MIDX's revindex for `pack_pos_to_midx()`, and another in the MIDX's object table for `nth_midxed_pack_int_id()`). But since the preferred pack does not change after the MIDX is written, it is safe to cache this value on the MIDX itself. Write a helper to do just that, and rewrite all of the existing call-sites that care about the identity of the preferred pack in terms of this new helper. This will prepare us for a subsequent patch where we will need to binary search through the MIDX's pseudo-pack order multiple times. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-read-midx.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 't') diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c index e48557aba1..4acae41bb9 100644 --- a/t/helper/test-read-midx.c +++ b/t/helper/test-read-midx.c @@ -6,6 +6,7 @@ #include "pack-bitmap.h" #include "packfile.h" #include "setup.h" +#include "gettext.h" static int read_midx_file(const char *object_dir, int show_objects) { @@ -79,7 +80,7 @@ static int read_midx_checksum(const char *object_dir) static int read_midx_preferred_pack(const char *object_dir) { struct multi_pack_index *midx = NULL; - struct bitmap_index *bitmap = NULL; + uint32_t preferred_pack; setup_git_directory(); @@ -87,16 +88,12 @@ static int read_midx_preferred_pack(const char *object_dir) if (!midx) return 1; - bitmap = prepare_bitmap_git(the_repository); - if (!bitmap) - return 1; - if (!bitmap_is_midx(bitmap)) { - free_bitmap_index(bitmap); + if (midx_preferred_pack(midx, &preferred_pack) < 0) { + warning(_("could not determine MIDX preferred pack")); return 1; } - printf("%s\n", midx->pack_names[midx_preferred_pack(bitmap)]); - free_bitmap_index(bitmap); + printf("%s\n", midx->pack_names[preferred_pack]); return 0; } -- cgit v1.2.3 From 3bea0c0611f1539b552c8255cffbe27932d0582f Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:24:39 -0500 Subject: t/test-lib-functions.sh: implement `test_trace2_data` helper Introduce a helper function which looks for a specific (category, key, value) tuple in the output of a trace2 event stream. We will use this function in a future patch to ensure that the expected number of objects are reused from an expected number of packs. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/test-lib-functions.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 't') diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh index 9c3cf12b26..93fe819b0a 100644 --- a/t/test-lib-functions.sh +++ b/t/test-lib-functions.sh @@ -1874,6 +1874,20 @@ test_region () { return 0 } +# Check that the given data fragment was included as part of the +# trace2-format trace on stdin. +# +# test_trace2_data +# +# For example, to look for trace2_data_intmax("pack-objects", repo, +# "reused", N) in an invocation of "git pack-objects", run: +# +# GIT_TRACE2_EVENT="$(pwd)/trace.txt" git pack-objects ... && +# test_trace2_data pack-objects reused N Date: Thu, 14 Dec 2023 17:24:44 -0500 Subject: pack-bitmap: enable reuse from all bitmapped packs Now that both the pack-bitmap and pack-objects code are prepared to handle marking and using objects from multiple bitmapped packs for verbatim reuse, allow marking objects from all bitmapped packs as eligible for reuse. Within the `reuse_partial_packfile_from_bitmap()` function, we no longer only mark the pack whose first object is at bit position zero for reuse, and instead mark any pack contained in the MIDX as a reuse candidate. Provide a handful of test cases in a new script (t5332) exercising interesting behavior for multi-pack reuse to ensure that we performed all of the previous steps correctly. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t5332-multi-pack-reuse.sh | 203 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100755 t/t5332-multi-pack-reuse.sh (limited to 't') diff --git a/t/t5332-multi-pack-reuse.sh b/t/t5332-multi-pack-reuse.sh new file mode 100755 index 0000000000..2ba788b042 --- /dev/null +++ b/t/t5332-multi-pack-reuse.sh @@ -0,0 +1,203 @@ +#!/bin/sh + +test_description='pack-objects multi-pack reuse' + +. ./test-lib.sh +. "$TEST_DIRECTORY"/lib-bitmap.sh + +objdir=.git/objects +packdir=$objdir/pack + +test_pack_reused () { + test_trace2_data pack-objects pack-reused "$1" +} + +test_packs_reused () { + test_trace2_data pack-objects packs-reused "$1" +} + + +# pack_position objects && + grep "$1" objects | cut -d" " -f1 +} + +test_expect_success 'preferred pack is reused for single-pack reuse' ' + test_config pack.allowPackReuse single && + + for i in A B + do + test_commit "$i" && + git repack -d || return 1 + done && + + git multi-pack-index write --bitmap && + + : >trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --revs --all >/dev/null && + + test_pack_reused 3 in <<-EOF && + $(git rev-parse C) + ^$(git rev-parse A) + EOF + + : >trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --revs /dev/null && + + test_pack_reused 6 trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --revs --all >/dev/null && + + test_pack_reused 9 in <<-EOF && + $(git rev-parse E) + ^$(git rev-parse D) + EOF + + : >trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --delta-base-offset --revs /dev/null && + + test_pack_reused 3 in <<-EOF && + $(git rev-parse E) + ^$(git rev-parse D) + EOF + + : >trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --delta-base-offset --revs /dev/null && + + test_pack_reused 3 f && + git add f && + test_tick && + git commit -m "delta" && + delta="$(git rev-parse HEAD)" && + + test_seq 64 >f && + test_tick && + git commit -a -m "base" && + base="$(git rev-parse HEAD)" && + + test_commit other && + + git repack -d && + + have_delta "$(git rev-parse $delta:f)" "$(git rev-parse $base:f)" && + + git multi-pack-index write --bitmap && + + cat >in <<-EOF && + $(git rev-parse other) + ^$base + EOF + + : >trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --delta-base-offset --revs /dev/null && + + # We can only reuse the 3 objects corresponding to "other" from + # the latest pack. + # + # This is because even though we want "delta", we do not want + # "base", meaning that we have to inflate the delta/base-pair + # corresponding to the blob in commit "delta", which bypasses + # the pack-reuse mechanism. + # + # The remaining objects from the other pack are similarly not + # reused because their objects are on the uninteresting side of + # the query. + test_pack_reused 3 in <<-EOF && + $(git rev-parse $base) + ^$(git rev-parse $delta) + EOF + + P="$(git pack-objects --revs $packdir/pack trace2.txt && + GIT_TRACE2_EVENT="$PWD/trace2.txt" \ + git pack-objects --stdout --delta-base-offset --all >/dev/null && + + packs_nr="$(find $packdir -type f -name "pack-*.pack" | wc -l)" && + objects_nr="$(git rev-list --count --all --objects)" && + + test_pack_reused $(($objects_nr - 1)) Date: Thu, 14 Dec 2023 17:24:47 -0500 Subject: t/perf: add performance tests for multi-pack reuse To ensure that we don't regress either the size or runtime performance of multi-pack reuse, add a performance test to measure both of these. The test partitions the objects in GIT_TEST_PERF_LARGE_REPO into 1, 10, and 100 packs, and then tries to perform a "clone" at each stage with both single- and multi-pack reuse enabled. Note that the `repack_into_n_chunks()` function in this new test script differs from the existing `repack_into_n()`. The former partitions the repository into N equal-sized chunks, while the latter produces N packs of five commits each (plus their objects), and then another pack with the remainder. On git.git, I can produce the following results on my machine: Test this tree -------------------------------------------------------------------------------- 5332.3: clone for 1-pack scenario (single-pack reuse) 1.57(2.99+0.15) 5332.4: clone size for 1-pack scenario (single-pack reuse) 231.8M 5332.5: clone for 1-pack scenario (multi-pack reuse) 1.79(2.96+0.21) 5332.6: clone size for 1-pack scenario (multi-pack reuse) 231.7M 5332.9: clone for 10-pack scenario (single-pack reuse) 3.89(16.75+0.35) 5332.10: clone size for 10-pack scenario (single-pack reuse) 209.9M 5332.11: clone for 10-pack scenario (multi-pack reuse) 1.56(2.99+0.17) 5332.12: clone size for 10-pack scenario (multi-pack reuse) 224.4M 5332.15: clone for 100-pack scenario (single-pack reuse) 8.24(54.31+0.59) 5332.16: clone size for 100-pack scenario (single-pack reuse) 278.3M 5332.17: clone for 100-pack scenario (multi-pack reuse) 2.13(2.44+0.33) 5332.18: clone size for 100-pack scenario (multi-pack reuse) 357.9M Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/perf/p5332-multi-pack-reuse.sh | 81 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100755 t/perf/p5332-multi-pack-reuse.sh (limited to 't') diff --git a/t/perf/p5332-multi-pack-reuse.sh b/t/perf/p5332-multi-pack-reuse.sh new file mode 100755 index 0000000000..5c6c575d62 --- /dev/null +++ b/t/perf/p5332-multi-pack-reuse.sh @@ -0,0 +1,81 @@ +#!/bin/sh + +test_description='tests pack performance with multi-pack reuse' + +. ./perf-lib.sh +. "${TEST_DIRECTORY}/perf/lib-pack.sh" + +packdir=.git/objects/pack + +test_perf_large_repo + +find_pack () { + for idx in $packdir/pack-*.idx + do + if git show-index <$idx | grep -q "$1" + then + basename $idx + fi || return 1 + done +} + +repack_into_n_chunks () { + git repack -adk && + + test "$1" -eq 1 && return || + + find $packdir -type f | sort >packs.before && + + # partition the repository into $1 chunks of consecutive commits, and + # then create $1 packs with the objects reachable from each chunk + # (excluding any objects reachable from the previous chunks) + sz="$(($(git rev-list --count --all) / $1))" + for rev in $(git rev-list --all | awk "NR % $sz == 0" | tac) + do + pack="$(echo "$rev" | git pack-objects --revs \ + --honor-pack-keep --delta-base-offset $packdir/pack)" && + touch $packdir/pack-$pack.keep || return 1 + done + + # grab any remaining objects not packed by the previous step(s) + git pack-objects --revs --all --honor-pack-keep --delta-base-offset \ + $packdir/pack && + + find $packdir -type f | sort >packs.after && + + # and install the whole thing + for f in $(comm -12 packs.before packs.after) + do + rm -f "$f" || return 1 + done + rm -fr $packdir/*.keep +} + +for nr_packs in 1 10 100 +do + test_expect_success "create $nr_packs-pack scenario" ' + repack_into_n_chunks $nr_packs + ' + + test_expect_success "setup bitmaps for $nr_packs-pack scenario" ' + find $packdir -type f -name "*.idx" | sed -e "s/.*\/\(.*\)$/+\1/g" | + git multi-pack-index write --stdin-packs --bitmap \ + --preferred-pack="$(find_pack $(git rev-parse HEAD))" + ' + + for reuse in single multi + do + test_perf "clone for $nr_packs-pack scenario ($reuse-pack reuse)" " + git for-each-ref --format='%(objectname)' refs/heads refs/tags >in && + git -c pack.allowPackReuse=$reuse pack-objects \ + --revs --delta-base-offset --use-bitmap-index \ + --stdout result + " + + test_size "clone size for $nr_packs-pack scenario ($reuse-pack reuse)" ' + wc -c