diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-12 11:48:14 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-15 09:57:06 +0300 |
commit | 8c72787d912d6deda52ea90352e35950bc3f1e48 (patch) | |
tree | 2b61a089666b7960429d609157179a8bcc4aa0de | |
parent | 84e99923567045f71b90b804f07f81ea4fef977d (diff) |
housekeeping: Repack object pools more aggressively
Object pools and normal repositories currently share the same heuristics
to decide whether to repack objects or not. While this strategy works,
it is not ideal for pool repositories because they may be reused across
many forks. It is thus desirable for us to keep them as efficient as
reasonably possible.
Tweak the heuristics to repack object pools more aggressively. Right now
this shouldn't make much of a difference: object pools are only updated
via `FetchIntoObjectPool()`, which knows to do a full repack when it has
fetched objects. As a result, no object pool should ever have more than
a single packfile in general.
This will make a difference though when we migrate object pools to not
do their own housekeeping anymore, but to use `OptimizeRepository()`
instead: we'll want to keep similar semantics as we have right now and
aggressively repack.
Note that we also need to fix some of our tests which used git-repack(1)
with the `-A` flag to bring repositories into the state we want to test
repository maintenance on. This will leave behind the old, superseded
packfile though and thus we end up with two packfiles even though the
intent of the test was to only test with a single packfile. This didn't
matter before where we required at least five packfiles before we pack
objects, but does now where the limit for object pools is at two packs.
-rw-r--r-- | internal/git/housekeeping/optimize_repository.go | 35 | ||||
-rw-r--r-- | internal/git/housekeeping/optimize_repository_test.go | 78 |
2 files changed, 81 insertions, 32 deletions
diff --git a/internal/git/housekeeping/optimize_repository.go b/internal/git/housekeeping/optimize_repository.go index 4bd48999a..ef74f8f0d 100644 --- a/internal/git/housekeeping/optimize_repository.go +++ b/internal/git/housekeeping/optimize_repository.go @@ -215,25 +215,36 @@ func needsRepacking(repo *localrepo.Repo) (bool, RepackObjectsConfig, error) { // relative though: for small repositories it's fine to do full repacks regularly, but for // large repositories we need to be more careful. We thus use a heuristic of "repository // largeness": we take the biggest packfile that exists, and then the maximum allowed number - // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)`. This gives the following - // allowed number of packfiles: + // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)` for normal repositories and + // `log(largestpackfile_size_in_mb) / log(10.0)` for pools. This gives the following allowed + // number of packfiles: // - // - No packfile: 5 packfile. This is a special case. - // - 10MB packfile: 8 packfiles. - // - 100MB packfile: 17 packfiles. - // - 500MB packfile: 23 packfiles. - // - 1GB packfile: 26 packfiles. - // - 5GB packfile: 32 packfiles. - // - 10GB packfile: 35 packfiles. - // - 100GB packfile: 43 packfiles. + // ------------------------------------------------------------------------------------- + // | largest packfile size | allowed packfiles for repos | allowed packfiles for pools | + // ------------------------------------------------------------------------------------- + // | none or <10MB | 5 | 2 | + // | 10MB | 8 | 2 | + // | 100MB | 17 | 2 | + // | 500MB | 23 | 2 | + // | 1GB | 26 | 3 | + // | 5GB | 32 | 3 | + // | 10GB | 35 | 4 | + // | 100GB | 43 | 5 | + // ------------------------------------------------------------------------------------- // // The goal is to have a comparatively quick ramp-up of allowed packfiles as the repository // size grows, but then slow down such that we're effectively capped and don't end up with - // an excessive amount of packfiles. + // an excessive amount of packfiles. On the other hand, pool repositories are potentially + // reused as basis for many forks and should thus be packed much more aggressively. // // This is a heuristic and thus imperfect by necessity. We may tune it as we gain experience // with the way it behaves. - if int64(math.Max(5, math.Log(float64(largestPackfileSize))/math.Log(1.3))) <= packfileCount { + lowerLimit, log := 5.0, 1.3 + if IsPoolRepository(repo) { + lowerLimit, log = 2.0, 10.0 + } + + if int64(math.Max(lowerLimit, math.Log(float64(largestPackfileSize))/math.Log(log))) <= packfileCount { return true, RepackObjectsConfig{ FullRepack: true, WriteBitmap: !hasAlternate, diff --git a/internal/git/housekeeping/optimize_repository_test.go b/internal/git/housekeeping/optimize_repository_test.go index 59bbd5cd1..939dab9b1 100644 --- a/internal/git/housekeeping/optimize_repository_test.go +++ b/internal/git/housekeeping/optimize_repository_test.go @@ -148,36 +148,44 @@ func TestNeedsRepacking(t *testing.T) { const megaByte = 1024 * 1024 for _, tc := range []struct { - packfileSize int64 - requiredPackfiles int + packfileSize int64 + requiredPackfiles int + requiredPackfilesForPool int }{ { - packfileSize: 1, - requiredPackfiles: 5, + packfileSize: 1, + requiredPackfiles: 5, + requiredPackfilesForPool: 2, }, { - packfileSize: 5 * megaByte, - requiredPackfiles: 6, + packfileSize: 5 * megaByte, + requiredPackfiles: 6, + requiredPackfilesForPool: 2, }, { - packfileSize: 10 * megaByte, - requiredPackfiles: 8, + packfileSize: 10 * megaByte, + requiredPackfiles: 8, + requiredPackfilesForPool: 2, }, { - packfileSize: 50 * megaByte, - requiredPackfiles: 14, + packfileSize: 50 * megaByte, + requiredPackfiles: 14, + requiredPackfilesForPool: 2, }, { - packfileSize: 100 * megaByte, - requiredPackfiles: 17, + packfileSize: 100 * megaByte, + requiredPackfiles: 17, + requiredPackfilesForPool: 2, }, { - packfileSize: 500 * megaByte, - requiredPackfiles: 23, + packfileSize: 500 * megaByte, + requiredPackfiles: 23, + requiredPackfilesForPool: 2, }, { - packfileSize: 1000 * megaByte, - requiredPackfiles: 26, + packfileSize: 1001 * megaByte, + requiredPackfiles: 26, + requiredPackfilesForPool: 3, }, // Let's not go any further than this, we're thrashing the temporary directory. } { @@ -203,10 +211,15 @@ func TestNeedsRepacking(t *testing.T) { require.NoError(t, os.WriteFile(bigPackPath, nil, 0o644)) require.NoError(t, os.Truncate(bigPackPath, tc.packfileSize)) + requiredPackfiles := tc.requiredPackfiles + if IsPoolRepository(repoProto) { + requiredPackfiles = tc.requiredPackfilesForPool + } + // And then we create one less packfile than we need to hit the boundary. // This is done to assert that we indeed don't repack before hitting the // boundary. - for i := 0; i < tc.requiredPackfiles-2; i++ { + for i := 0; i < requiredPackfiles-2; i++ { additionalPackfile, err := os.Create(filepath.Join(packDir, fmt.Sprintf("%d.pack", i))) require.NoError(t, err) testhelper.MustClose(t, additionalPackfile) @@ -543,11 +556,36 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ RelativePath: relativePath, }) + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "-d", "--write-bitmap-index") + return repo + }, + expectedMetrics: `# HELP gitaly_housekeeping_tasks_total Total number of housekeeping tasks performed in the repository +# TYPE gitaly_housekeeping_tasks_total counter +gitaly_housekeeping_tasks_total{housekeeping_task="packed_objects_full", status="success"} 1 +gitaly_housekeeping_tasks_total{housekeeping_task="written_bitmap", status="success"} 1 +gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 +`, + }, + { + desc: "repository with multiple packfiles packs only for object pool", + setup: func(t *testing.T, relativePath string) *gitalypb.Repository { + repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ + RelativePath: relativePath, + }) + + // Note: git-repack(1) without "-d" will _not_ delete the old + // packfile and thus end up with two packfiles. gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--split", "--changed-paths") + return repo }, expectedMetrics: `# HELP gitaly_housekeeping_tasks_total Total number of housekeeping tasks performed in the repository # TYPE gitaly_housekeeping_tasks_total counter +gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 +`, + expectedMetricsForPool: `# HELP gitaly_housekeeping_tasks_total Total number of housekeeping tasks performed in the repository +# TYPE gitaly_housekeeping_tasks_total counter gitaly_housekeeping_tasks_total{housekeeping_task="packed_objects_full", status="success"} 1 gitaly_housekeeping_tasks_total{housekeeping_task="written_bitmap", status="success"} 1 gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 @@ -559,7 +597,7 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ RelativePath: relativePath, }) - gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "-d", "--write-bitmap-index") gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--split", "--changed-paths") return repo }, @@ -574,7 +612,7 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ RelativePath: relativePath, }) - gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "-d", "--write-bitmap-index") gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--split", "--changed-paths") // The repack won't repack the following objects because they're @@ -606,7 +644,7 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ RelativePath: relativePath, }) - gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-A", "-d", "--write-bitmap-index") gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--split", "--changed-paths") // The repack won't repack the following objects because they're |