diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-08 11:47:22 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-15 16:15:09 +0300 |
commit | aba6f57d36e45034fd2c516b964a9ff4bc3528d7 (patch) | |
tree | 85896ba94bb0379d01f72bf2dce9bebd5e8e1d47 | |
parent | 219ce8a325bb722e13ab70539dc1bc70a44c37e6 (diff) |
housekeeping: Improve heuristics for writing commit-graphs
Now that writing commit-graphs has been disentangled from repacking
objects we can iterate on the heuristics we use to determine whether we
need to write commit-graphs:
- We stop writing commit-graphs in case there are no references.
This wouldn't make any sense anyway given that we use only write
commit-graphs for reachable commits, of which there aren't any in
case there are no references.
- We stop repacking objects based on whether the repository has
bloom filters or not. This logic was previously mixed into the
heuristics for `RepackObjects()`, but that has only be a historic
artifact because that function handled both repacks and rewrites
of commit-graphs. Instead, we only rewrite commit-graphs based on
that information.
Ultimately, the end result should be that we repack objects less often,
but keep rewriting commit-graphs in the same circumstances as we did
before.
Add tests to verify this new behaviour and adjust existing tests for
repacking of objects.
-rw-r--r-- | internal/git/housekeeping/optimize_repository.go | 87 | ||||
-rw-r--r-- | internal/git/housekeeping/optimize_repository_ext_test.go | 5 | ||||
-rw-r--r-- | internal/git/housekeeping/optimize_repository_test.go | 164 |
3 files changed, 209 insertions, 47 deletions
diff --git a/internal/git/housekeeping/optimize_repository.go b/internal/git/housekeeping/optimize_repository.go index a6ec186f9..28f984287 100644 --- a/internal/git/housekeeping/optimize_repository.go +++ b/internal/git/housekeeping/optimize_repository.go @@ -88,12 +88,10 @@ func optimizeRepository(ctx context.Context, m *RepositoryManager, repo *localre timer.ObserveDuration() timer = prometheus.NewTimer(m.tasksLatency.WithLabelValues("commit-graph")) - if didRepack { - if err := WriteCommitGraph(ctx, repo); err != nil { - optimizations["written_commit_graph"] = "failure" - return fmt.Errorf("could not write commit-graph: %w", err) - } - + if didWriteCommitGraph, err := writeCommitGraphIfNeeded(ctx, repo, didRepack); err != nil { + optimizations["written_commit_graph"] = "failure" + return fmt.Errorf("could not write commit-graph: %w", err) + } else if didWriteCommitGraph { optimizations["written_commit_graph"] = "success" } timer.ObserveDuration() @@ -196,26 +194,6 @@ func needsRepacking(repo *localrepo.Repo) (bool, RepackObjectsConfig, error) { }, nil } - missingBloomFilters, err := stats.IsMissingBloomFilters(repoPath) - if err != nil { - return false, RepackObjectsConfig{}, fmt.Errorf("checking for bloom filters: %w", err) - } - - // Bloom filters are part of the commit-graph and allow us to efficiently determine which - // paths have been modified in a given commit without having to look into the object - // database. In the past we didn't compute bloom filters at all, so we want to rewrite the - // whole commit-graph to generate them. - // - // Note that we'll eventually want to move out commit-graph generation from repacking. When - // that happens we should update the commit-graph either if it's missing, when bloom filters - // are missing or when packfiles have been updated. - if missingBloomFilters { - return true, RepackObjectsConfig{ - FullRepack: true, - WriteBitmap: !hasAlternate, - }, nil - } - // Whenever we do an incremental repack we create a new packfile, and as a result Git may // have to look into every one of the packfiles to find objects. This is less efficient the // more packfiles we have, but we cannot repack the whole repository every time either given @@ -375,6 +353,63 @@ func estimateLooseObjectCount(repo *localrepo.Repo, cutoffDate time.Time) (int64 return looseObjects * 256, nil } +// writeCommitGraphIfNeeded writes the commit-graph if required. +func writeCommitGraphIfNeeded(ctx context.Context, repo *localrepo.Repo, didRepack bool) (bool, error) { + needed, err := needsWriteCommitGraph(ctx, repo, didRepack) + if err != nil { + return false, fmt.Errorf("determining whether repo needs commit-graph update: %w", err) + } + if !needed { + return false, nil + } + + if err := WriteCommitGraph(ctx, repo); err != nil { + return true, fmt.Errorf("writing commit-graph: %w", err) + } + + return true, nil +} + +// needsWriteCommitGraph determines whether we need to write the commit-graph. +func needsWriteCommitGraph(ctx context.Context, repo *localrepo.Repo, didRepack bool) (bool, error) { + looseRefs, packedRefsSize, err := countLooseAndPackedRefs(ctx, repo) + if err != nil { + return false, fmt.Errorf("counting refs: %w", err) + } + + // If the repository doesn't have any references at all then there is no point in writing + // commit-graphs given that it would only contain reachable objects, of which there are + // none. + if looseRefs == 0 && packedRefsSize == 0 { + return false, nil + } + + // When we repacked the repository then chances are high that we have accumulated quite some + // objects since the last time we wrote a commit-graph. + if didRepack { + return true, nil + } + + repoPath, err := repo.Path() + if err != nil { + return false, fmt.Errorf("getting repository path: %w", err) + } + + // Bloom filters are part of the commit-graph and allow us to efficiently determine which + // paths have been modified in a given commit without having to look into the object + // database. In the past we didn't compute bloom filters at all, so we want to rewrite the + // whole commit-graph to generate them. + missingBloomFilters, err := stats.IsMissingBloomFilters(repoPath) + if err != nil { + return false, fmt.Errorf("checking for bloom filters: %w", err) + } + if missingBloomFilters { + return true, nil + } + + return false, nil +} + // pruneIfNeeded removes objects from the repository which are either unreachable or which are // already part of a packfile. We use a grace period of two weeks. func pruneIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { diff --git a/internal/git/housekeeping/optimize_repository_ext_test.go b/internal/git/housekeeping/optimize_repository_ext_test.go index 7eae05993..13940b9b6 100644 --- a/internal/git/housekeeping/optimize_repository_ext_test.go +++ b/internal/git/housekeeping/optimize_repository_ext_test.go @@ -120,9 +120,8 @@ func TestPruneIfNeeded(t *testing.T) { require.NoError(t, housekeeping.NewManager(cfg.Prometheus, nil).OptimizeRepository(ctx, repo)) expectedLogEntry := map[string]string{ - "packed_objects_full": "success", - "written_commit_graph": "success", - "written_bitmap": "success", + "packed_objects_full": "success", + "written_bitmap": "success", } if tc.expectedPrune { diff --git a/internal/git/housekeeping/optimize_repository_test.go b/internal/git/housekeeping/optimize_repository_test.go index 453dd4ca7..09c5476c4 100644 --- a/internal/git/housekeeping/optimize_repository_test.go +++ b/internal/git/housekeeping/optimize_repository_test.go @@ -73,11 +73,10 @@ func TestNeedsRepacking(t *testing.T) { return repoProto }, - expectedNeeded: true, - expectedConfig: RepackObjectsConfig{ - FullRepack: true, - WriteBitmap: false, - }, + // If we have no bitmap in the repository we'd normally want to fully repack + // the repository. But because we have an alternates file we know that the + // repository must not have a bitmap anyway, so we can skip the repack here. + expectedNeeded: false, }, { desc: "missing commit-graph", @@ -90,11 +89,10 @@ func TestNeedsRepacking(t *testing.T) { return repoProto }, - expectedNeeded: true, - expectedConfig: RepackObjectsConfig{ - FullRepack: true, - WriteBitmap: true, - }, + // The commit-graph used to influence whether we repacked a repository or + // not. That was due to historic reasons only, though, and ultimately does + // not make any sense. + expectedNeeded: false, }, { desc: "commit-graph without bloom filters", @@ -108,11 +106,10 @@ func TestNeedsRepacking(t *testing.T) { return repoProto }, - expectedNeeded: true, - expectedConfig: RepackObjectsConfig{ - FullRepack: true, - WriteBitmap: true, - }, + // The commit-graph used to influence whether we repacked a repository or + // not. That was due to historic reasons only, though, and ultimately does + // not make any sense. + expectedNeeded: false, }, { desc: "no repack needed", @@ -552,7 +549,7 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 `, }, { - desc: "repository without commit-graph repacks objects", + desc: "repository without commit-graph writes commit-graph", setup: func(t *testing.T, relativePath string) *gitalypb.Repository { repo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0], gittest.CloneRepoOpts{ RelativePath: relativePath, @@ -562,9 +559,7 @@ gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 }, expectedMetrics: `# HELP gitaly_housekeeping_tasks_total Total number of housekeeping tasks performed in the repository # TYPE gitaly_housekeeping_tasks_total counter -gitaly_housekeeping_tasks_total{housekeeping_task="packed_objects_full", status="success"} 1 gitaly_housekeeping_tasks_total{housekeeping_task="written_commit_graph", status="success"} 1 -gitaly_housekeeping_tasks_total{housekeeping_task="written_bitmap", status="success"} 1 gitaly_housekeeping_tasks_total{housekeeping_task="total", status="success"} 1 `, }, @@ -973,3 +968,136 @@ func TestPruneIfNeeded(t *testing.T) { } }) } + +func TestWriteCommitGraphIfNeeded(t *testing.T) { + ctx := testhelper.Context(t) + cfg := testcfg.Build(t) + + for _, tc := range []struct { + desc string + setup func(t *testing.T) (*gitalypb.Repository, string) + didRepack bool + expectedWrite bool + expectedCommitGraph bool + }{ + { + desc: "empty repository", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + return gittest.InitRepo(t, cfg, cfg.Storages[0]) + }, + didRepack: true, + expectedWrite: false, + }, + { + desc: "repository with objects but no refs", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteBlob(t, cfg, repoPath, []byte("something")) + return repoProto, repoPath + }, + didRepack: true, + expectedWrite: false, + }, + { + desc: "repository without commit-graph", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithBranch("main")) + return repoProto, repoPath + }, + expectedWrite: true, + expectedCommitGraph: true, + }, + { + desc: "repository with old-style unsplit commit-graph", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithBranch("main")) + + // Write a non-split commit-graph with bloom filters. We should + // always rewrite the commit-graphs when we're not using a split + // commit-graph. We make sure to add bloom filters via + // `--changed-paths` given that it would otherwise cause us to + // rewrite the graph regardless of whether the graph is split or not + // if they were missing. + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--reachable", "--changed-paths") + + return repoProto, repoPath + }, + expectedWrite: true, + expectedCommitGraph: true, + }, + { + desc: "repository with split commit-graph without bitmap", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithBranch("main")) + + // Generate a split commit-graph, but don't enable computation of + // changed paths. This should trigger a rewrite so that we can + // recompute all graphs and generate the changed paths. + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--reachable", "--split") + + return repoProto, repoPath + }, + expectedWrite: true, + expectedCommitGraph: true, + }, + { + desc: "repository with split commit-graph with bitmap without repack", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithBranch("main")) + + // Write a split commit-graph with bitmaps. This is the state we + // want to be in. + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--reachable", "--split", "--changed-paths") + + return repoProto, repoPath + }, + // We use the information about whether we repacked objects as an indicator + // whether something has changed in the repository. If it didn't, then we + // assume no new objects exist and thus we don't rewrite the commit-graph. + didRepack: false, + expectedWrite: false, + expectedCommitGraph: true, + }, + { + desc: "repository with split commit-graph with bitmap with repack", + setup: func(t *testing.T) (*gitalypb.Repository, string) { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithBranch("main")) + + // Write a split commit-graph with bitmaps. This is the state we + // want to be in, so there is no write required if we didn't also + // repack objects. + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--reachable", "--split", "--changed-paths") + + return repoProto, repoPath + }, + // When we have a valid commit-graph, but objects have been repacked, we + // assume that there are new objects in the repository. So consequentially, + // we should write the commit-graphs. + didRepack: true, + expectedWrite: true, + expectedCommitGraph: true, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + repoProto, repoPath := tc.setup(t) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + didWrite, err := writeCommitGraphIfNeeded(ctx, repo, tc.didRepack) + require.NoError(t, err) + require.Equal(t, tc.expectedWrite, didWrite) + + commitGraphPath := filepath.Join(repoPath, "objects", "info", "commit-graphs", "commit-graph-chain") + if tc.expectedCommitGraph { + require.FileExists(t, commitGraphPath) + } else { + require.NoFileExists(t, commitGraphPath) + } + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "verify") + }) + } +} |