diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2023-03-28 17:32:14 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2023-03-28 17:32:14 +0300 |
commit | d1c0a77c4b8df1bd075488d89c4c13f62f00f471 (patch) | |
tree | 0d86ba6f89d3e0229d406d6cd02d7ec03143620a | |
parent | c0b74cb617177ef15eaa91f566d71e113e4ca393 (diff) |
git/housekeeping: Implement support for geometric repacking
Ideally, objects in repositories would only ever be contained in a
single packfile so that they can be efficiently searched and served
during normal operations. Unfortunately, writing packfiles becomes more
and more expensive the larger the repository becomes. As a consequence,
packing objects needs to be based on a compromise between keeping the
number of overall packfiles low and keeping the overhead of the repack
itself low.
The current approach implemented by Gitaly is to accept a certain number
of packfiles that scales with the repository size: the larger the repo,
the more packfiles we accept. But once the threshold has been reached,
we perform a full repack that moves all objects into a single packfile.
This heuristic is better than what we had a year ago, but it is still
suboptimal in the context of large monorepositories where the number of
packfiles will grow very fast. We thus still end up repacking such
repositories quite frequently.
To help with this exact scenario, git-repack(1) has gained support for
geometric repacking in Git v2.32.0. With this mode, git-repack(1) will
make sure that packfiles form a geometric sequence: the next-larger
packfile must have at least factor `r` times as many objects as the
previous packfile. If this condition is violated, Git will pick the
smallest set of packfiles that need to be merged in order to restore the
geometric sequence. Like this, we typically only have to merge a much
smaller set of objects into a single packfile compared to the previous
mode where we had to perform a full repack regularly.
Gitaly will soon start to make use of geometric repacking as well. As a
preparatory step, implement the infrastructure in the form of a new
repacking strategy and a bunch of tests to nail down that the new
command really behaves as we expect it to behave.
The new infrastructure is not yet wired up and will thus not get hit via
production traffic.
Changelog: added
-rw-r--r-- | internal/git/housekeeping/objects.go | 39 | ||||
-rw-r--r-- | internal/git/housekeeping/objects_test.go | 212 | ||||
-rw-r--r-- | internal/git/housekeeping/testhelper_test.go | 2 |
3 files changed, 253 insertions, 0 deletions
diff --git a/internal/git/housekeeping/objects.go b/internal/git/housekeeping/objects.go index 103573d1c..7202b2c8b 100644 --- a/internal/git/housekeeping/objects.go +++ b/internal/git/housekeeping/objects.go @@ -39,6 +39,11 @@ const ( // objects into a new packfile. Unreachable objects will be written into a separate cruft // packfile. RepackObjectsStrategyFullWithCruft + // RepackObjectsStrategyGeometric performs an geometric repack. This strategy will repack + // packfiles so that the resulting pack structure forms a geometric sequence in the number + // of objects. Loose objects will get soaked up as part of the repack regardless of their + // reachability. + RepackObjectsStrategyGeometric ) // RepackObjectsConfig is configuration for RepackObjects. @@ -95,6 +100,40 @@ func RepackObjects(ctx context.Context, repo *localrepo.Repo, cfg RepackObjectsC Value: cfg.CruftExpireBefore.Format(rfc2822DateFormat), }) } + case RepackObjectsStrategyGeometric: + options = []git.Option{ + // We use a geometric factor `r`, which means that every successively larger + // packfile must have at least `r` times the number of objects. + // + // This factor ultimately determines how many packfiles there can be at a + // maximum in a repository for a given number of objects. The maximum number + // of objects with `n` packfiles and a factor `r` is `(1 - r^n) / (1 - r)`. + // E.g. with a factor of 4 and 10 packfiles, we can have at most 349,525 + // objects, with 16 packfiles we can have 1,431,655,765 objects. Contrary to + // that, having a factor of 2 will translate to 1023 objects at 10 packfiles + // and 65535 objects at 16 packfiles at a maximum. + // + // So what we're effectively choosing here is how often we need to repack + // larger parts of the repository. The higher the factor the more we'll have + // to repack as the packfiles will be larger. On the other hand, having a + // smaller factor means we'll have to repack less objects as the slices we + // need to repack will have less objects. + // + // The end result is a hybrid approach between incremental repacks and full + // repacks: we won't typically repack the full repository, but only a subset + // of packfiles. + // + // For now, we choose a geometric factor of two. Large repositories nowadays + // typically have a few million objects, which would boil down to having at + // most 32 packfiles in the repository. This number is not scientifically + // chosen though any may be changed at a later point in time. + git.ValueFlag{Name: "--geometric", Value: "2"}, + // Make sure to delete loose objects and packfiles that are made obsolete + // by the new packfile. + git.Flag{Name: "-d"}, + // Don't include objects part of an alternate. + git.Flag{Name: "-l"}, + } default: return structerr.NewInvalidArgument("invalid strategy %d", cfg.Strategy) } diff --git a/internal/git/housekeeping/objects_test.go b/internal/git/housekeeping/objects_test.go index 2bc44dfcf..36deaadd6 100644 --- a/internal/git/housekeeping/objects_test.go +++ b/internal/git/housekeeping/objects_test.go @@ -1,7 +1,9 @@ package housekeeping import ( + "os" "path/filepath" + "strings" "testing" "time" @@ -9,6 +11,7 @@ import ( "gitlab.com/gitlab-org/gitaly/v15/internal/git/gittest" "gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/v15/internal/git/stats" + "gitlab.com/gitlab-org/gitaly/v15/internal/helper/perm" "gitlab.com/gitlab-org/gitaly/v15/internal/structerr" "gitlab.com/gitlab-org/gitaly/v15/internal/testhelper" "gitlab.com/gitlab-org/gitaly/v15/internal/testhelper/testcfg" @@ -389,6 +392,215 @@ func TestRepackObjects(t *testing.T) { packfiles: 1, }, }, + { + desc: "geometric repack with reachable object", + setup: func(t *testing.T, repoPath string) { + gittest.WriteCommit(t, cfg, repoPath, gittest.WithBranch("main"), gittest.WithMessage("unreachable")) + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + looseObjects: 2, + }, + stateAfterRepack: objectsState{ + packfiles: 1, + }, + }, + { + desc: "geometric repack soaks up unreachable objects", + setup: func(t *testing.T, repoPath string) { + gittest.WriteBlob(t, cfg, repoPath, []byte("unreachable blob")) + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + looseObjects: 1, + }, + stateAfterRepack: objectsState{ + packfiles: 1, + }, + }, + { + desc: "geometric repack leaves cruft pack alone", + setup: func(t *testing.T, repoPath string) { + gittest.WriteBlob(t, cfg, repoPath, []byte("unreachable blob")) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "--cruft", "-d", "-n", "--no-write-bitmap-index") + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 1, + cruftPacks: 1, + }, + stateAfterRepack: objectsState{ + packfiles: 1, + cruftPacks: 1, + }, + }, + { + desc: "geometric repack leaves keep pack alone", + setup: func(t *testing.T, repoPath string) { + gittest.WriteCommit(t, cfg, repoPath, gittest.WithBranch("main")) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "-n", "--no-write-bitmap-index") + + packPath, err := filepath.Glob(filepath.Join(repoPath, "objects", "pack", "pack-*.pack")) + require.NoError(t, err) + require.Len(t, packPath, 1) + + keepPath := strings.TrimSuffix(packPath[0], ".pack") + ".keep" + require.NoError(t, os.WriteFile(keepPath, nil, perm.PrivateFile)) + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 1, + keepPacks: 1, + }, + stateAfterRepack: objectsState{ + packfiles: 1, + keepPacks: 1, + }, + }, + { + desc: "geometric repack keeps valid geometric sequence", + setup: func(t *testing.T, repoPath string) { + // Write a commit that in total contains 3 objects: 1 commit, 1 tree + // and 1 blob. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "a", Mode: "100644", Content: "a"}, + ), gittest.WithBranch("main")) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "-n", "--no-write-bitmap-index") + + // Now we write another blob. As the previous packfile contains 3x + // the amount of objects we should just create a new packfile + // instead of merging them. + gittest.WriteBlob(t, cfg, repoPath, []byte("new blob")) + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 1, + looseObjects: 1, + }, + stateAfterRepack: objectsState{ + packfiles: 2, + }, + }, + { + desc: "geometric repack keeps preexisting pack when new pack violates sequence", + setup: func(t *testing.T, repoPath string) { + // Write a commit that in total contains 3 objects: 1 commit, 1 tree + // and 1 blob. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "a", Mode: "100644", Content: "a"}, + ), gittest.WithBranch("main")) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "-n", "--no-write-bitmap-index") + + // Now we write two additional blobs. Even though the newly written + // packfile will cause us to invalidate the geometric sequence, we + // still create it without merging them. This _seems_ to be on + // purpose: packfiles will only be merged when preexisting packs + // validate the sequence. + gittest.WriteBlob(t, cfg, repoPath, []byte("one")) + gittest.WriteBlob(t, cfg, repoPath, []byte("two")) + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 1, + looseObjects: 2, + }, + stateAfterRepack: objectsState{ + packfiles: 2, + }, + }, + { + desc: "geometric repack repacks when geometric sequence is invalidated", + setup: func(t *testing.T, repoPath string) { + // Write a commit that in total contains 3 objects: 1 commit, 1 tree + // and 1 blob. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "a", Mode: "100644", Content: "a"}, + ), gittest.WithBranch("main")) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "-n", "--no-write-bitmap-index") + + // Write a second set of objects that is reachable so that we can + // create a second packfile easily. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1", Mode: "100644", Content: "1"}, + ), gittest.WithBranch("main")) + // Do an incremental repack now. We thus have two packfiles that + // invalidate the geometric sequence. + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-d", "-n", "--no-write-bitmap-index") + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 2, + }, + stateAfterRepack: objectsState{ + packfiles: 1, + }, + }, + { + desc: "geometric repack consolidates multiple packs into one", + setup: func(t *testing.T, repoPath string) { + // Write a commit that in total contains 3 objects: 1 commit, 1 tree + // and 1 blob. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "a", Mode: "100644", Content: "a"}, + ), gittest.WithBranch("main")) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-d", "-n", "--no-write-bitmap-index") + + // Write another commit with three new objects, so that we now have + // two packfiles with three objects each. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "b", Mode: "100644", Content: "b"}, + ), gittest.WithBranch("main")) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-d", "-n", "--no-write-bitmap-index") + + // Write a third commit with 12 new objects. + gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "01", Mode: "100644", Content: "01"}, + gittest.TreeEntry{Path: "02", Mode: "100644", Content: "02"}, + gittest.TreeEntry{Path: "03", Mode: "100644", Content: "03"}, + gittest.TreeEntry{Path: "04", Mode: "100644", Content: "04"}, + gittest.TreeEntry{Path: "05", Mode: "100644", Content: "05"}, + gittest.TreeEntry{Path: "06", Mode: "100644", Content: "06"}, + gittest.TreeEntry{Path: "07", Mode: "100644", Content: "07"}, + gittest.TreeEntry{Path: "08", Mode: "100644", Content: "08"}, + gittest.TreeEntry{Path: "09", Mode: "100644", Content: "09"}, + gittest.TreeEntry{Path: "10", Mode: "100644", Content: "10"}, + ), gittest.WithBranch("main")) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-d", "-n", "--no-write-bitmap-index") + + // We now have three packfiles: two with three objects respectively, + // and one with 12 objects. The first two packfiles do not form a + // geometric sequence, but if they are packed together they result + // in a packfile with 6 objects, which would restore the sequence. + // So what we want to see is that we start with three, but end with + // 2 packfiles. + }, + repackCfg: RepackObjectsConfig{ + Strategy: RepackObjectsStrategyGeometric, + }, + stateBeforeRepack: objectsState{ + packfiles: 3, + }, + stateAfterRepack: objectsState{ + packfiles: 2, + }, + }, } { tc := tc diff --git a/internal/git/housekeeping/testhelper_test.go b/internal/git/housekeeping/testhelper_test.go index 4b8e922ad..4e7d1a291 100644 --- a/internal/git/housekeeping/testhelper_test.go +++ b/internal/git/housekeeping/testhelper_test.go @@ -31,6 +31,7 @@ type objectsState struct { looseObjects uint64 packfiles uint64 cruftPacks uint64 + keepPacks uint64 hasBitmap bool hasMultiPackIndex bool hasMultiPackIndexBitmap bool @@ -46,6 +47,7 @@ func requireObjectsState(tb testing.TB, repo *localrepo.Repo, expectedState obje looseObjects: repoInfo.LooseObjects.Count, packfiles: repoInfo.Packfiles.Count, cruftPacks: repoInfo.Packfiles.CruftCount, + keepPacks: repoInfo.Packfiles.KeepCount, hasBitmap: repoInfo.Packfiles.Bitmap.Exists, hasMultiPackIndex: repoInfo.Packfiles.MultiPackIndex.Exists, hasMultiPackIndexBitmap: repoInfo.Packfiles.MultiPackIndexBitmap.Exists, |