diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-07 12:05:17 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-10 10:44:03 +0300 |
commit | fb336f03024d99927804cc3781a55996ccd817ad (patch) | |
tree | 27990302272591a1b177fe84fa4d3df5a2509454 | |
parent | bdddc1e4f9011068d14883a3377525fef0e9141b (diff) |
repository: Use heuristic for full repacks in OptimizeRepository
Right now we're only repacking repositories in OptimizeRepository when
they're missing auxiliary caches like bitmaps or commit-graphs with
bloom filters. While this has been helpful in rolling out the generation
of these caches, ultimately we're advancing to a state where every repo
should have both of them. As a result, calling OptimizeRepository is
becoming more and more of a no-op over time. What we want is for this
RPC to not only repack repositories if they're lacking crucial caches,
but also when we can determine that they're in an inefficient state.
Introduce a new heuristic which determines whether a repository needs a
full repack. A full repack will collapse all the packfiles we have into
a single packfile, which allows Git to look up objects without having to
search through all packfiles. Furthermore, it allows us to deltify more
objects because deltas can only be created for objects which reside in
the same packfile.
Doing a full repack of the repository scales with the number of objects
which exist in the repository and with their total size: the bigger the
repository is, the longer it takes. As a consequence, we need to become
more careful with repacking the repository as it grows given that we'd
otherwise be repacking it all the time.
To address this we use a heuristic which estimates the repository's size
based on the biggest packfile that exists in the repository. While this
is not an accurate reflection of repository size, it is fast to compute.
From here on we use a logarithmic function to grow the boundary of how
many packfiles need to exist before we do a full repack. This has the
effect that we're initially scaling up the number of allowed packfiles
quite fast, but decelerate when we approach repositories where the
packfiles range in the gigabytes.
This heuristic is imperfect by necessity, but it should give us a first
good iteration of this. Note that this is only a stop-gap solution
anyway: eventually, we want to migrate away from the full/incremental
repack split and always use multi-pack indices and geometric repacking,
which solves this problem of huge repositories in a much saner way.
Changelog: changed
-rw-r--r-- | internal/gitaly/service/repository/optimize.go | 85 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize_test.go | 98 |
2 files changed, 183 insertions, 0 deletions
diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go index 8a4829da1..1d5a84202 100644 --- a/internal/gitaly/service/repository/optimize.go +++ b/internal/gitaly/service/repository/optimize.go @@ -2,8 +2,12 @@ package repository import ( "context" + "errors" "fmt" + "math" "os" + "path/filepath" + "strings" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "gitlab.com/gitlab-org/gitaly/v14/internal/git/housekeeping" @@ -138,5 +142,86 @@ func needsRepacking(repo *localrepo.Repo) (bool, repackCommandConfig, error) { }, nil } + largestPackfileSize, packfileCount, err := packfileSizeAndCount(repo) + if err != nil { + return false, repackCommandConfig{}, fmt.Errorf("checking largest packfile size: %w", err) + } + + // Whenever we do an incremental repack we create a new packfile, and as a result Git may + // have to look into every one of the packfiles to find objects. This is less efficient the + // more packfiles we have, but we cannot repack the whole repository every time either given + // that this may take a lot of time. + // + // Instead, we determine whether the repository has "too many" packfiles. "Too many" is + // relative though: for small repositories it's fine to do full repacks regularly, but for + // large repositories we need to be more careful. We thus use a heuristic of "repository + // largeness": we take the biggest packfile that exists, and then the maximum allowed number + // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)`. This gives the following + // allowed number of packfiles: + // + // - No packfile: 5 packfile. This is a special case. + // - 10MB packfile: 8 packfiles. + // - 100MB packfile: 17 packfiles. + // - 500MB packfile: 23 packfiles. + // - 1GB packfile: 26 packfiles. + // - 5GB packfile: 32 packfiles. + // - 10GB packfile: 35 packfiles. + // - 100GB packfile: 43 packfiles. 
+ // + // The goal is to have a comparatively quick ramp-up of allowed packfiles as the repository + // size grows, but then slow down such that we're effectively capped and don't end up with + // an excessive amount of packfiles. + // + // This is a heuristic and thus imperfect by necessity. We may tune it as we gain experience + // with the way it behaves. + if int64(math.Max(5, math.Log(float64(largestPackfileSize))/math.Log(1.3))) < packfileCount { + return true, repackCommandConfig{ + fullRepack: true, + writeBitmap: !hasAlternate, + }, nil + } + return false, repackCommandConfig{}, nil } + +func packfileSizeAndCount(repo *localrepo.Repo) (int64, int64, error) { + repoPath, err := repo.Path() + if err != nil { + return 0, 0, fmt.Errorf("getting repository path: %w", err) + } + + entries, err := os.ReadDir(filepath.Join(repoPath, "objects/pack")) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return 0, 0, nil + } + + return 0, 0, err + } + + largestSize := int64(0) + count := int64(0) + + for _, entry := range entries { + if !strings.HasSuffix(entry.Name(), ".pack") { + continue + } + + entryInfo, err := entry.Info() + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + + return 0, 0, fmt.Errorf("getting packfile info: %w", err) + } + + if entryInfo.Size() > largestSize { + largestSize = entryInfo.Size() + } + + count++ + } + + return largestSize / 1024 / 1024, count, nil +} diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go index 59d1f4e35..200f67eae 100644 --- a/internal/gitaly/service/repository/optimize_test.go +++ b/internal/gitaly/service/repository/optimize_test.go @@ -2,6 +2,8 @@ package repository import ( "bytes" + "fmt" + "io" "os" "path/filepath" "testing" @@ -188,6 +190,15 @@ func TestOptimizeRepositoryValidation(t *testing.T) { require.NoError(t, err) } +type infiniteReader struct{} + +func (r infiniteReader) Read(b []byte) (int, error) { + for i := 
range b { + b[i] = '\000' + } + return len(b), nil +} + func TestNeedsRepacking(t *testing.T) { t.Parallel() @@ -313,4 +324,91 @@ func TestNeedsRepacking(t *testing.T) { require.Equal(t, tc.expectedConfig, repackCfg) }) } + + const megaByte = 1024 * 1024 + + for _, tc := range []struct { + packfileSize int64 + requiredPackfiles int + }{ + { + packfileSize: 1, + requiredPackfiles: 5, + }, + { + packfileSize: 5 * megaByte, + requiredPackfiles: 6, + }, + { + packfileSize: 10 * megaByte, + requiredPackfiles: 8, + }, + { + packfileSize: 50 * megaByte, + requiredPackfiles: 14, + }, + { + packfileSize: 100 * megaByte, + requiredPackfiles: 17, + }, + { + packfileSize: 500 * megaByte, + requiredPackfiles: 23, + }, + { + packfileSize: 1000 * megaByte, + requiredPackfiles: 26, + }, + // Let's not go any further than this, we're thrashing the temporary directory. + } { + t.Run(fmt.Sprintf("packfile with %d bytes", tc.packfileSize), func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + packDir := filepath.Join(repoPath, "objects", "pack") + + // Emulate the existence of a bitmap and a commit-graph with bloom filters. + // We explicitly don't want to generate them via Git commands as they would + // require us to already have objects in the repository, and we want to be + // in full control over all objects and packfiles in the repo. + require.NoError(t, os.WriteFile(filepath.Join(packDir, "something.bitmap"), nil, 0o644)) + commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath) + require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755)) + require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644)) + + // We first create a single big packfile which is used to determine the + // boundary of when we repack. 
+ bigPackfile, err := os.OpenFile(filepath.Join(packDir, "big.pack"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644) + require.NoError(t, err) + defer testhelper.MustClose(t, bigPackfile) + _, err = io.Copy(bigPackfile, io.LimitReader(infiniteReader{}, tc.packfileSize)) + require.NoError(t, err) + + // And then we create one less packfile than we need to hit the boundary. + // This is done to assert that we indeed don't repack before hitting the + // boundary. + for i := 0; i < tc.requiredPackfiles-1; i++ { + additionalPackfile, err := os.Create(filepath.Join(packDir, fmt.Sprintf("%d.pack", i))) + require.NoError(t, err) + testhelper.MustClose(t, additionalPackfile) + } + + repackNeeded, _, err := needsRepacking(repo) + require.NoError(t, err) + require.False(t, repackNeeded) + + // Now we create the additional packfile that causes us to hit the boundary. + // We should thus see that we want to repack now. + lastPackfile, err := os.Create(filepath.Join(packDir, "last.pack")) + require.NoError(t, err) + testhelper.MustClose(t, lastPackfile) + + repackNeeded, repackCfg, err := needsRepacking(repo) + require.NoError(t, err) + require.True(t, repackNeeded) + require.Equal(t, repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, repackCfg) + }) + } } |