diff options
author | Toon Claes <toon@gitlab.com> | 2022-02-10 12:22:20 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2022-02-10 12:22:20 +0300 |
commit | 3074a5673df93a6a765c2bdde84bd4f6f670afcb (patch) | |
tree | 000ff6b969004853218c94b6a416134db73ccc26 | |
parent | d9e05a74f672aa10f5e3449d4bae433995974471 (diff) | |
parent | 9d5181b803f5f4720ded8e070db13d6c1fe73123 (diff) |
Merge branch 'pks-optimize-repository-heuristic-optimizations' into 'master'
Extend heuristic optimizations in OptimizeRepository
Closes #2296
See merge request gitlab-org/gitaly!4332
-rw-r--r-- | internal/git/command_description.go | 3 | ||||
-rw-r--r-- | internal/gitaly/service/repository/cleanup.go | 35 | ||||
-rw-r--r-- | internal/gitaly/service/repository/cleanup_test.go | 29 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize.go | 404 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize_test.go | 557 | ||||
-rw-r--r-- | internal/gitaly/service/repository/repack.go | 43 |
6 files changed, 1021 insertions, 50 deletions
diff --git a/internal/git/command_description.go b/internal/git/command_description.go index d4d5ec82d..a1a33c037 100644 --- a/internal/git/command_description.go +++ b/internal/git/command_description.go @@ -176,6 +176,9 @@ var commandDescriptions = map[string]commandDescription{ "pack-objects": { flags: scNoRefUpdates | scGeneratesPackfiles, }, + "prune": { + flags: scNoRefUpdates, + }, "push": { flags: scNoRefUpdates, opts: []GlobalOption{ diff --git a/internal/gitaly/service/repository/cleanup.go b/internal/gitaly/service/repository/cleanup.go index 3bb71faa0..8755c053d 100644 --- a/internal/gitaly/service/repository/cleanup.go +++ b/internal/gitaly/service/repository/cleanup.go @@ -127,6 +127,41 @@ func isExitWithCode(err error, code int) bool { } func cleanDisconnectedWorktrees(ctx context.Context, repo *localrepo.Repo) error { + repoPath, err := repo.Path() + if err != nil { + return err + } + + // Spawning a command is expensive. We thus try to avoid the overhead by first + // determining if there could possibly be any work to be done by git-worktree(1). We do so + // by reading the directory in which worktrees are stored, and if it's empty then we know + // that there aren't any worktrees in the first place. + worktreeEntries, err := os.ReadDir(filepath.Join(repoPath, "worktrees")) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + } + + hasWorktrees := false + for _, worktreeEntry := range worktreeEntries { + if !worktreeEntry.IsDir() { + continue + } + + if worktreeEntry.Name() == "." || worktreeEntry.Name() == ".." { + continue + } + + hasWorktrees = true + break + } + + // There are no worktrees, so let's avoid spawning the Git command. 
+ if !hasWorktrees { + return nil + } + return repo.ExecAndWait(ctx, git.SubSubCmd{ Name: "worktree", Action: "prune", diff --git a/internal/gitaly/service/repository/cleanup_test.go b/internal/gitaly/service/repository/cleanup_test.go index 215a57b11..9975a22ab 100644 --- a/internal/gitaly/service/repository/cleanup_test.go +++ b/internal/gitaly/service/repository/cleanup_test.go @@ -8,8 +8,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v14/internal/git" "gitlab.com/gitlab-org/gitaly/v14/internal/git/gittest" "gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo" + "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/v14/internal/testhelper" "gitlab.com/gitlab-org/gitaly/v14/internal/testhelper/testcfg" "gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb" @@ -141,6 +143,33 @@ func TestCleanupDisconnectedWorktrees(t *testing.T) { gittest.AddWorktree(t, cfg, repoPath, worktreePath) } +func TestCleanupDisconnectedWorktrees_doesNothingWithoutWorktrees(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg, repoProto, repoPath, _ := setupRepositoryService(ctx, t) + worktreePath := filepath.Join(testhelper.TempDir(t), "worktree") + + failingGitCmdFactory := gittest.NewInterceptingCommandFactory(ctx, t, cfg, func(git.ExecutionEnvironment) string { + return `#!/usr/bin/env bash + exit 15 + ` + }) + + repo := localrepo.New(config.NewLocator(cfg), failingGitCmdFactory, nil, repoProto) + + // If this command did spawn git-worktree(1) we'd see an error. It doesn't though because it + // detects that there aren't any worktrees at all. + require.NoError(t, cleanDisconnectedWorktrees(ctx, repo)) + + gittest.AddWorktree(t, cfg, repoPath, worktreePath) + + // We have now added a worktree now, so it should detect that there are worktrees and thus + // spawn the Git command. We thus expect the error code we inject via the failing Git + // command factory. 
+ require.EqualError(t, cleanDisconnectedWorktrees(ctx, repo), "exit status 15") +} + func TestRemoveWorktree(t *testing.T) { t.Parallel() diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go index 4bfbfb386..b13566036 100644 --- a/internal/gitaly/service/repository/optimize.go +++ b/internal/gitaly/service/repository/optimize.go @@ -1,10 +1,18 @@ package repository import ( + "bytes" "context" + "errors" "fmt" + "io/fs" + "math" "os" + "path/filepath" + "strings" + "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" + "gitlab.com/gitlab-org/gitaly/v14/internal/git" "gitlab.com/gitlab-org/gitaly/v14/internal/git/housekeeping" "gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/v14/internal/git/stats" @@ -12,86 +20,404 @@ import ( "gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb" ) +const ( + // looseObjectLimit is the limit of loose objects we accept both when doing incremental + // repacks and when pruning objects. 
+ looseObjectLimit = 1024 +) + +func (s *server) OptimizeRepository(ctx context.Context, in *gitalypb.OptimizeRepositoryRequest) (*gitalypb.OptimizeRepositoryResponse, error) { + if err := s.validateOptimizeRepositoryRequest(in); err != nil { + return nil, err + } + + repo := s.localrepo(in.GetRepository()) + + if err := s.optimizeRepository(ctx, repo); err != nil { + return nil, helper.ErrInternal(err) + } + + return &gitalypb.OptimizeRepositoryResponse{}, nil +} + +func (s *server) validateOptimizeRepositoryRequest(in *gitalypb.OptimizeRepositoryRequest) error { + if in.GetRepository() == nil { + return helper.ErrInvalidArgumentf("empty repository") + } + + _, err := s.locator.GetRepoPath(in.GetRepository()) + if err != nil { + return err + } + + return nil +} + +func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) error { + optimizations := struct { + PackedObjects bool `json:"packed_objects"` + PrunedObjects bool `json:"pruned_objects"` + PackedRefs bool `json:"packed_refs"` + }{} + defer func() { + ctxlogrus.Extract(ctx).WithField("optimizations", optimizations).Info("optimized repository") + }() + + if err := housekeeping.Perform(ctx, repo, s.txManager); err != nil { + return fmt.Errorf("could not execute houskeeping: %w", err) + } + + if err := cleanupWorktrees(ctx, repo); err != nil { + return fmt.Errorf("could not clean up worktrees: %w", err) + } + + didRepack, err := repackIfNeeded(ctx, repo) + if err != nil { + return fmt.Errorf("could not repack: %w", err) + } + optimizations.PackedObjects = didRepack + + didPrune, err := pruneIfNeeded(ctx, repo) + if err != nil { + return fmt.Errorf("could not prune: %w", err) + } + optimizations.PrunedObjects = didPrune + + didPackRefs, err := packRefsIfNeeded(ctx, repo) + if err != nil { + return fmt.Errorf("could not pack refs: %w", err) + } + optimizations.PackedRefs = didPackRefs + + return nil +} + // repackIfNeeded uses a set of heuristics to determine whether the repository needs a // 
full repack and, if so, repacks it. -func (s *server) repackIfNeeded(ctx context.Context, repo *localrepo.Repo, repoProto *gitalypb.Repository) error { +func repackIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { + repackNeeded, cfg, err := needsRepacking(repo) + if err != nil { + return false, fmt.Errorf("determining whether repo needs repack: %w", err) + } + + if !repackNeeded { + return false, nil + } + + if err := repack(ctx, repo, cfg); err != nil { + return false, err + } + + return true, nil +} + +func needsRepacking(repo *localrepo.Repo) (bool, repackCommandConfig, error) { repoPath, err := repo.Path() if err != nil { - return err + return false, repackCommandConfig{}, fmt.Errorf("getting repository path: %w", err) + } + + altFile, err := repo.InfoAlternatesPath() + if err != nil { + return false, repackCommandConfig{}, helper.ErrInternal(err) + } + + hasAlternate := true + if _, err := os.Stat(altFile); os.IsNotExist(err) { + hasAlternate = false } hasBitmap, err := stats.HasBitmap(repoPath) if err != nil { - return helper.ErrInternal(err) + return false, repackCommandConfig{}, fmt.Errorf("checking for bitmap: %w", err) + } + + // Bitmaps are used to efficiently determine transitive reachability of objects from a + // set of commits. They are an essential part of the puzzle required to serve fetches + // efficiently, as we'd otherwise need to traverse the object graph every time to find + // which objects we have to send. We thus repack the repository with bitmaps enabled in + // case they're missing. + // + // There is one exception: repositories which are connected to an object pool must not have + // a bitmap on their own. We do not yet use multi-pack indices, and in that case Git can + // only use one bitmap. We already generate this bitmap in the pool, so member of it + // shouldn't have another bitmap on their own. 
+ if !hasBitmap && !hasAlternate { + return true, repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, nil } missingBloomFilters, err := stats.IsMissingBloomFilters(repoPath) if err != nil { - return helper.ErrInternal(err) + return false, repackCommandConfig{}, fmt.Errorf("checking for bloom filters: %w", err) } - if hasBitmap && !missingBloomFilters { - return nil + // Bloom filters are part of the commit-graph and allow us to efficiently determine which + // paths have been modified in a given commit without having to look into the object + // database. In the past we didn't compute bloom filters at all, so we want to rewrite the + // whole commit-graph to generate them. + // + // Note that we'll eventually want to move out commit-graph generation from repacking. When + // that happens we should update the commit-graph either if it's missing, when bloom filters + // are missing or when packfiles have been updated. + if missingBloomFilters { + return true, repackCommandConfig{ + fullRepack: true, + writeBitmap: !hasAlternate, + }, nil } - altFile, err := repo.InfoAlternatesPath() + largestPackfileSize, packfileCount, err := packfileSizeAndCount(repo) if err != nil { - return helper.ErrInternal(err) + return false, repackCommandConfig{}, fmt.Errorf("checking largest packfile size: %w", err) } - // Repositories with alternates should never have a bitmap, as Git will otherwise complain about - // multiple bitmaps being present in parent and alternate repository. - // In case of an error it still tries it is best to optimise the repository. - createBitMap := false - if _, err := os.Stat(altFile); os.IsNotExist(err) { - createBitMap = true + // Whenever we do an incremental repack we create a new packfile, and as a result Git may + // have to look into every one of the packfiles to find objects. This is less efficient the + // more packfiles we have, but we cannot repack the whole repository every time either given + // that this may take a lot of time. 
+ // + // Instead, we determine whether the repository has "too many" packfiles. "Too many" is + // relative though: for small repositories it's fine to do full repacks regularly, but for + // large repositories we need to be more careful. We thus use a heuristic of "repository + // largeness": we take the biggest packfile that exists, and then the maximum allowed number + // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)`. This gives the following + // allowed number of packfiles: + // + // - No packfile: 5 packfile. This is a special case. + // - 10MB packfile: 8 packfiles. + // - 100MB packfile: 17 packfiles. + // - 500MB packfile: 23 packfiles. + // - 1GB packfile: 26 packfiles. + // - 5GB packfile: 32 packfiles. + // - 10GB packfile: 35 packfiles. + // - 100GB packfile: 43 packfiles. + // + // The goal is to have a comparatively quick ramp-up of allowed packfiles as the repository + // size grows, but then slow down such that we're effectively capped and don't end up with + // an excessive amount of packfiles. + // + // This is a heuristic and thus imperfect by necessity. We may tune it as we gain experience + // with the way it behaves. + if int64(math.Max(5, math.Log(float64(largestPackfileSize))/math.Log(1.3))) < packfileCount { + return true, repackCommandConfig{ + fullRepack: true, + writeBitmap: !hasAlternate, + }, nil } - if _, err = s.RepackFull(ctx, &gitalypb.RepackFullRequest{ - Repository: repoProto, - CreateBitmap: createBitMap, - }); err != nil { - return err + looseObjectCount, err := estimateLooseObjectCount(repo) + if err != nil { + return false, repackCommandConfig{}, fmt.Errorf("estimating loose object count: %w", err) } - return nil + // Most Git commands do not write packfiles directly, but instead write loose objects into + // the object database. So while we now know that there ain't too many packfiles, we still + // need to check whether we have too many objects. 
+ // + // In this case it doesn't make a lot of sense to scale incremental repacks with the repo's + // size: we only pack loose objects, so the time to pack them doesn't scale with repository + // size but with the number of loose objects we have. git-gc(1) uses a threshold of 6700 + // loose objects to start an incremental repack, but one needs to keep in mind that Git + // typically has defaults which are better suited for the client-side instead of the + // server-side in most commands. + // + // In our case we typically want to ensure that our repositories are much better packed than + // it is necessary on the client side. We thus take a much stricter limit of 1024 objects. + if looseObjectCount > looseObjectLimit { + return true, repackCommandConfig{ + fullRepack: false, + writeBitmap: false, + }, nil + } + + return false, repackCommandConfig{}, nil } -func (s *server) optimizeRepository(ctx context.Context, repository *gitalypb.Repository) error { - repo := s.localrepo(repository) +func packfileSizeAndCount(repo *localrepo.Repo) (int64, int64, error) { + repoPath, err := repo.Path() + if err != nil { + return 0, 0, fmt.Errorf("getting repository path: %w", err) + } - if err := s.repackIfNeeded(ctx, repo, repository); err != nil { - return fmt.Errorf("could not repack: %w", err) + entries, err := os.ReadDir(filepath.Join(repoPath, "objects/pack")) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return 0, 0, nil + } + + return 0, 0, err } - if err := housekeeping.Perform(ctx, repo, s.txManager); err != nil { - return fmt.Errorf("could not execute houskeeping: %w", err) + largestSize := int64(0) + count := int64(0) + + for _, entry := range entries { + if !strings.HasSuffix(entry.Name(), ".pack") { + continue + } + + entryInfo, err := entry.Info() + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + + return 0, 0, fmt.Errorf("getting packfile info: %w", err) + } + + if entryInfo.Size() > largestSize { + largestSize = 
entryInfo.Size() + } + + count++ } - return nil + return largestSize / 1024 / 1024, count, nil } -func (s *server) OptimizeRepository(ctx context.Context, in *gitalypb.OptimizeRepositoryRequest) (*gitalypb.OptimizeRepositoryResponse, error) { - if err := s.validateOptimizeRepositoryRequest(in); err != nil { - return nil, err +// estimateLooseObjectCount estimates the number of loose objects in the repository. Due to the +// object name being derived via a cryptographic hash we know that in the general case, objects are +// evenly distributed across their sharding directories. We can thus estimate the number of loose +// objects by opening a single sharding directory and counting its entries. +func estimateLooseObjectCount(repo *localrepo.Repo) (int64, error) { + repoPath, err := repo.Path() + if err != nil { + return 0, fmt.Errorf("getting repository path: %w", err) } - if err := s.optimizeRepository(ctx, in.GetRepository()); err != nil { - return nil, helper.ErrInternal(err) + // We use the same sharding directory as git-gc(1) does for its estimation. + entries, err := os.ReadDir(filepath.Join(repoPath, "objects/17")) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return 0, nil + } + + return 0, fmt.Errorf("reading loose object shard: %w", err) } - return &gitalypb.OptimizeRepositoryResponse{}, nil + looseObjects := int64(0) + for _, entry := range entries { + if strings.LastIndexAny(entry.Name(), "0123456789abcdef") != len(entry.Name())-1 { + continue + } + + looseObjects++ + } + + // Scale up found loose objects by the number of sharding directories. + return looseObjects * 256, nil } -func (s *server) validateOptimizeRepositoryRequest(in *gitalypb.OptimizeRepositoryRequest) error { - if in.GetRepository() == nil { - return helper.ErrInvalidArgumentf("empty repository") +// pruneIfNeeded removes objects from the repository which are either unreachable or which are +// already part of a packfile. We use a grace period of two weeks. 
+func pruneIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { + // Pool repositories must never prune any objects, or otherwise we may corrupt members of + // that pool if they still refer to that object. + if strings.HasPrefix(repo.GetRelativePath(), "@pools") { + return false, nil } - _, err := s.locator.GetRepoPath(in.GetRepository()) + looseObjectCount, err := estimateLooseObjectCount(repo) if err != nil { - return err + return false, fmt.Errorf("estimating loose object count: %w", err) } - return nil + // We again use the same limit here as we do when doing an incremental repack. This is done + // intentionally: if we determine that there's too many loose objects and try to repack, but + // all of those loose objects are in fact unreachable, then we'd still have the same number + // of unreachable objects after the incremental repack. We'd thus try to repack every single + // time. + // + // Using the same limit here doesn't quite fix this case: the unreachable objects would only + // be pruned after a grace period of two weeks. But at least we know that we will eventually + // prune up those unreachable objects, at which point we won't try to do another incremental + // repack. 
+ if looseObjectCount <= looseObjectLimit { + return false, nil + } + + if err := repo.ExecAndWait(ctx, git.SubCmd{ + Name: "prune", + }); err != nil { + return false, fmt.Errorf("pruning objects: %w", err) + } + + return true, nil +} + +func packRefsIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { + repoPath, err := repo.Path() + if err != nil { + return false, fmt.Errorf("getting repository path: %w", err) + } + refsPath := filepath.Join(repoPath, "refs") + + looseRefs := int64(0) + if err := filepath.WalkDir(refsPath, func(path string, entry fs.DirEntry, err error) error { + if err != nil { + return err + } + + if !entry.IsDir() { + looseRefs++ + } + + return nil + }); err != nil { + return false, fmt.Errorf("counting loose refs: %w", err) + } + + // If there aren't any loose refs then there is nothing we need to do. + if looseRefs == 0 { + return false, nil + } + + packedRefsSize := int64(0) + if stat, err := os.Stat(filepath.Join(repoPath, "packed-refs")); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return false, fmt.Errorf("getting packed-refs size: %w", err) + } + } else { + packedRefsSize = stat.Size() + } + + // Packing loose references into the packed-refs file scales with the number of references + // we're about to write. We thus decide whether we repack refs by weighing the current size + // of the packed-refs file against the number of loose references. This is done such that we + // do not repack too often on repositories with a huge number of references, where we can + // expect a lot of churn in the number of references. 
+ // + // As a heuristic, we repack if the number of loose references in the repository exceeds + // `log(packed_refs_size_in_bytes/100)/log(1.15)`, which scales as following (number of refs + // is estimated with 100 bytes per reference): + // + // - 1kB ~ 10 packed refs: 16 refs + // - 10kB ~ 100 packed refs: 33 refs + // - 100kB ~ 1k packed refs: 49 refs + // - 1MB ~ 10k packed refs: 66 refs + // - 10MB ~ 100k packed refs: 82 refs + // - 100MB ~ 1m packed refs: 99 refs + // + // We thus allow roughly 16 additional loose refs per factor of ten of packed refs. + // + // This heuristic may likely need tweaking in the future, but should serve as a good first + // iteration. + if int64(math.Max(16, math.Log(float64(packedRefsSize)/100)/math.Log(1.15))) > looseRefs { + return false, nil + } + + var stderr bytes.Buffer + if err := repo.ExecAndWait(ctx, git.SubCmd{ + Name: "pack-refs", + Flags: []git.Option{ + git.Flag{Name: "--all"}, + }, + }, git.WithStderr(&stderr)); err != nil { + return false, fmt.Errorf("packing refs: %w, stderr: %q", err, stderr.String()) + } + + return true, nil } diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go index c35ffa924..f7e4152e0 100644 --- a/internal/gitaly/service/repository/optimize_test.go +++ b/internal/gitaly/service/repository/optimize_test.go @@ -2,6 +2,8 @@ package repository import ( "bytes" + "fmt" + "io" "os" "path/filepath" "testing" @@ -10,6 +12,7 @@ import ( "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v14/internal/git" "gitlab.com/gitlab-org/gitaly/v14/internal/git/gittest" + "gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/v14/internal/git/stats" "gitlab.com/gitlab-org/gitaly/v14/internal/testhelper" "gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb" @@ -186,3 +189,557 @@ func TestOptimizeRepositoryValidation(t *testing.T) { _, err := client.OptimizeRepository(ctx, 
&gitalypb.OptimizeRepositoryRequest{Repository: repo}) require.NoError(t, err) } + +type infiniteReader struct{} + +func (r infiniteReader) Read(b []byte) (int, error) { + for i := range b { + b[i] = '\000' + } + return len(b), nil +} + +func TestNeedsRepacking(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg, _ := setupRepositoryServiceWithoutRepo(t) + + for _, tc := range []struct { + desc string + setup func(t *testing.T) *gitalypb.Repository + expectedErr error + expectedNeeded bool + expectedConfig repackCommandConfig + }{ + { + desc: "empty repo", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, _ := gittest.CreateRepository(ctx, t, cfg) + return repoProto + }, + // This is a bug: if the repo is empty then we wouldn't ever generate a + // packfile, but we determine a repack is needed because it's missing a + // bitmap. It's a rather benign bug though: git-repack(1) will exit + // immediately because it knows that there's nothing to repack. + expectedNeeded: true, + expectedConfig: repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, + }, + { + desc: "missing bitmap", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, _ := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + Seed: gittest.SeedGitLabTest, + }) + return repoProto + }, + expectedNeeded: true, + expectedConfig: repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, + }, + { + desc: "missing bitmap with alternate", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + Seed: gittest.SeedGitLabTest, + }) + + // Create the alternates file. If it exists, then we shouldn't try + // to generate a bitmap. 
+ require.NoError(t, os.WriteFile(filepath.Join(repoPath, "objects", "info", "alternates"), nil, 0o755)) + + return repoProto + }, + expectedNeeded: true, + expectedConfig: repackCommandConfig{ + fullRepack: true, + writeBitmap: false, + }, + }, + { + desc: "missing commit-graph", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + Seed: gittest.SeedGitLabTest, + }) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index") + + return repoProto + }, + expectedNeeded: true, + expectedConfig: repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, + }, + { + desc: "commit-graph without bloom filters", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + Seed: gittest.SeedGitLabTest, + }) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write") + + return repoProto + }, + expectedNeeded: true, + expectedConfig: repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, + }, + { + desc: "no repack needed", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + Seed: gittest.SeedGitLabTest, + }) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index") + gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--changed-paths", "--split") + + return repoProto + }, + expectedNeeded: false, + }, + } { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + + repoProto := tc.setup(t) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + repackNeeded, repackCfg, err := needsRepacking(repo) + require.Equal(t, tc.expectedErr, err) + require.Equal(t, tc.expectedNeeded, repackNeeded) + require.Equal(t, tc.expectedConfig, 
repackCfg) + }) + } + + const megaByte = 1024 * 1024 + + for _, tc := range []struct { + packfileSize int64 + requiredPackfiles int + }{ + { + packfileSize: 1, + requiredPackfiles: 5, + }, + { + packfileSize: 5 * megaByte, + requiredPackfiles: 6, + }, + { + packfileSize: 10 * megaByte, + requiredPackfiles: 8, + }, + { + packfileSize: 50 * megaByte, + requiredPackfiles: 14, + }, + { + packfileSize: 100 * megaByte, + requiredPackfiles: 17, + }, + { + packfileSize: 500 * megaByte, + requiredPackfiles: 23, + }, + { + packfileSize: 1000 * megaByte, + requiredPackfiles: 26, + }, + // Let's not go any further than this, we're thrashing the temporary directory. + } { + t.Run(fmt.Sprintf("packfile with %d bytes", tc.packfileSize), func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + packDir := filepath.Join(repoPath, "objects", "pack") + + // Emulate the existence of a bitmap and a commit-graph with bloom filters. + // We explicitly don't want to generate them via Git commands as they would + // require us to already have objects in the repository, and we want to be + // in full control over all objects and packfiles in the repo. + require.NoError(t, os.WriteFile(filepath.Join(packDir, "something.bitmap"), nil, 0o644)) + commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath) + require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755)) + require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644)) + + // We first create a single big packfile which is used to determine the + // boundary of when we repack. 
+ bigPackfile, err := os.OpenFile(filepath.Join(packDir, "big.pack"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644) + require.NoError(t, err) + defer testhelper.MustClose(t, bigPackfile) + _, err = io.Copy(bigPackfile, io.LimitReader(infiniteReader{}, tc.packfileSize)) + require.NoError(t, err) + + // And then we create one less packfile than we need to hit the boundary. + // This is done to assert that we indeed don't repack before hitting the + // boundary. + for i := 0; i < tc.requiredPackfiles-1; i++ { + additionalPackfile, err := os.Create(filepath.Join(packDir, fmt.Sprintf("%d.pack", i))) + require.NoError(t, err) + testhelper.MustClose(t, additionalPackfile) + } + + repackNeeded, _, err := needsRepacking(repo) + require.NoError(t, err) + require.False(t, repackNeeded) + + // Now we create the additional packfile that causes us to hit the boundary. + // We should thus see that we want to repack now. + lastPackfile, err := os.Create(filepath.Join(packDir, "last.pack")) + require.NoError(t, err) + testhelper.MustClose(t, lastPackfile) + + repackNeeded, repackCfg, err := needsRepacking(repo) + require.NoError(t, err) + require.True(t, repackNeeded) + require.Equal(t, repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, repackCfg) + }) + } + + for _, tc := range []struct { + desc string + looseObjects []string + expectedRepack bool + }{ + { + desc: "no objects", + looseObjects: nil, + expectedRepack: false, + }, + { + desc: "object not in 17 shard", + looseObjects: []string{ + filepath.Join("ab/12345"), + }, + expectedRepack: false, + }, + { + desc: "object in 17 shard", + looseObjects: []string{ + filepath.Join("17/12345"), + }, + expectedRepack: false, + }, + { + desc: "objects in different shards", + looseObjects: []string{ + filepath.Join("ab/12345"), + filepath.Join("cd/12345"), + filepath.Join("12/12345"), + filepath.Join("17/12345"), + }, + expectedRepack: false, + }, + { + desc: "boundary", + looseObjects: []string{ + filepath.Join("17/1"), + 
filepath.Join("17/2"), + filepath.Join("17/3"), + filepath.Join("17/4"), + }, + expectedRepack: false, + }, + { + desc: "exceeding boundary should cause repack", + looseObjects: []string{ + filepath.Join("17/1"), + filepath.Join("17/2"), + filepath.Join("17/3"), + filepath.Join("17/4"), + filepath.Join("17/5"), + }, + expectedRepack: true, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + // Emulate the existence of a bitmap and a commit-graph with bloom filters. + // We explicitly don't want to generate them via Git commands as they would + // require us to already have objects in the repository, and we want to be + // in full control over all objects and packfiles in the repo. + require.NoError(t, os.WriteFile(filepath.Join(repoPath, "objects", "pack", "something.bitmap"), nil, 0o644)) + commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath) + require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755)) + require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644)) + + for _, looseObjectPath := range tc.looseObjects { + looseObjectPath := filepath.Join(repoPath, "objects", looseObjectPath) + require.NoError(t, os.MkdirAll(filepath.Dir(looseObjectPath), 0o755)) + + looseObjectFile, err := os.Create(looseObjectPath) + require.NoError(t, err) + testhelper.MustClose(t, looseObjectFile) + } + + repackNeeded, repackCfg, err := needsRepacking(repo) + require.NoError(t, err) + require.Equal(t, tc.expectedRepack, repackNeeded) + if tc.expectedRepack { + require.Equal(t, repackCommandConfig{ + fullRepack: false, + writeBitmap: false, + }, repackCfg) + } + }) + } +} + +func TestPruneIfNeeded(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg, _ := setupRepositoryServiceWithoutRepo(t) + + for _, tc := range []struct { + desc string + isPool bool + looseObjects []string + expectedPrune bool + }{ + { 
+ desc: "no objects", + looseObjects: nil, + expectedPrune: false, + }, + { + desc: "object not in 17 shard", + looseObjects: []string{ + filepath.Join("ab/12345"), + }, + expectedPrune: false, + }, + { + desc: "object in 17 shard", + looseObjects: []string{ + filepath.Join("17/12345"), + }, + expectedPrune: false, + }, + { + desc: "objects in different shards", + looseObjects: []string{ + filepath.Join("ab/12345"), + filepath.Join("cd/12345"), + filepath.Join("12/12345"), + filepath.Join("17/12345"), + }, + expectedPrune: false, + }, + { + desc: "boundary", + looseObjects: []string{ + filepath.Join("17/1"), + filepath.Join("17/2"), + filepath.Join("17/3"), + filepath.Join("17/4"), + }, + expectedPrune: false, + }, + { + desc: "exceeding boundary should cause repack", + looseObjects: []string{ + filepath.Join("17/1"), + filepath.Join("17/2"), + filepath.Join("17/3"), + filepath.Join("17/4"), + filepath.Join("17/5"), + }, + expectedPrune: true, + }, + { + desc: "exceeding boundary on pool", + isPool: true, + looseObjects: []string{ + filepath.Join("17/1"), + filepath.Join("17/2"), + filepath.Join("17/3"), + filepath.Join("17/4"), + filepath.Join("17/5"), + }, + expectedPrune: false, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + relativePath := gittest.NewRepositoryName(t, true) + if tc.isPool { + relativePath = gittest.NewObjectPoolName(t) + } + + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + RelativePath: relativePath, + }) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + for _, looseObjectPath := range tc.looseObjects { + looseObjectPath := filepath.Join(repoPath, "objects", looseObjectPath) + require.NoError(t, os.MkdirAll(filepath.Dir(looseObjectPath), 0o755)) + + looseObjectFile, err := os.Create(looseObjectPath) + require.NoError(t, err) + testhelper.MustClose(t, looseObjectFile) + } + + didPrune, err := pruneIfNeeded(ctx, repo) + require.Equal(t, tc.expectedPrune, didPrune) + require.NoError(t, err) 
+		})
+	}
+}
+
+// TestPackRefsIfNeeded verifies that packRefsIfNeeded only packs refs once the
+// number of loose refs crosses a boundary, and that this boundary grows with
+// the size of the existing packed-refs file.
+func TestPackRefsIfNeeded(t *testing.T) {
+	t.Parallel()
+
+	ctx := testhelper.Context(t)
+	cfg, _ := setupRepositoryServiceWithoutRepo(t)
+
+	const kiloByte = 1024
+
+	// Note how the required number of refs grows much slower than the
+	// packed-refs size: every tenfold increase of the file size raises the
+	// boundary by only roughly 16 refs.
+	for _, tc := range []struct {
+		packedRefsSize int64
+		requiredRefs   int
+	}{
+		{
+			packedRefsSize: 1,
+			requiredRefs:   16,
+		},
+		{
+			packedRefsSize: 1 * kiloByte,
+			requiredRefs:   16,
+		},
+		{
+			packedRefsSize: 10 * kiloByte,
+			requiredRefs:   33,
+		},
+		{
+			packedRefsSize: 100 * kiloByte,
+			requiredRefs:   49,
+		},
+		{
+			packedRefsSize: 1000 * kiloByte,
+			requiredRefs:   66,
+		},
+		{
+			packedRefsSize: 10000 * kiloByte,
+			requiredRefs:   82,
+		},
+		{
+			packedRefsSize: 100000 * kiloByte,
+			requiredRefs:   99,
+		},
+	} {
+		t.Run(fmt.Sprintf("packed-refs with %d bytes", tc.packedRefsSize), func(t *testing.T) {
+			repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+			repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+			// Write an empty commit such that we can create valid refs.
+			commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents())
+			looseRefContent := []byte(commitID.String() + "\n")
+
+			// We first create a single big packed-refs file which is used to determine
+			// the boundary of when we repack. We need to write a valid packed-refs file
+			// or otherwise git-pack-refs(1) would choke later on, so we just write the
+			// file such that every line is a separate ref of exactly 128 bytes in
+			// length (a divisor of 1024), referring to the commit we created above.
+			packedRefs, err := os.OpenFile(filepath.Join(repoPath, "packed-refs"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644)
+			require.NoError(t, err)
+			defer testhelper.MustClose(t, packedRefs)
+			for i := int64(0); i < tc.packedRefsSize/128; i++ {
+				packedRefLine := fmt.Sprintf("%s refs/something/this-line-is-padded-to-exactly-128-bytes-%030d\n", commitID.String(), i)
+				require.Len(t, packedRefLine, 128)
+				_, err := packedRefs.WriteString(packedRefLine)
+				require.NoError(t, err)
+			}
+			// Flush to disk so that the final file size is observable.
+			require.NoError(t, packedRefs.Sync())
+
+			// And then we create one less loose ref than we need to hit the boundary.
+			// This is done to assert that we indeed don't repack before hitting the
+			// boundary.
+			for i := 0; i < tc.requiredRefs-1; i++ {
+				looseRefPath := filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("branch-%d", i))
+				require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644))
+			}
+
+			didRepack, err := packRefsIfNeeded(ctx, repo)
+			require.NoError(t, err)
+			require.False(t, didRepack)
+
+			// Now we create the additional loose ref that causes us to hit the
+			// boundary. We should thus see that we want to repack now.
+			looseRefPath := filepath.Join(repoPath, "refs", "heads", "last-branch")
+			require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644))
+
+			didRepack, err = packRefsIfNeeded(ctx, repo)
+			require.NoError(t, err)
+			require.True(t, didRepack)
+		})
+	}
+}
+
+// TestEstimateLooseObjectCount verifies that the loose-object estimate is
+// derived from the "17" shard alone, extrapolated to all 256 object shards.
+func TestEstimateLooseObjectCount(t *testing.T) {
+	t.Parallel()
+
+	ctx := testhelper.Context(t)
+	cfg, _ := setupRepositoryServiceWithoutRepo(t)
+	repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+	repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+	// NOTE: the following subtests build on each other's repository state
+	// and must thus run in order.
+	t.Run("empty repository", func(t *testing.T) {
+		looseObjects, err := estimateLooseObjectCount(repo)
+		require.NoError(t, err)
+		require.Zero(t, looseObjects)
+	})
+
+	t.Run("object in different shard", func(t *testing.T) {
+		// Objects outside of the sampled "17" shard must not influence the
+		// estimate at all.
+		differentShard := filepath.Join(repoPath, "objects", "a0")
+		require.NoError(t, os.MkdirAll(differentShard, 0o755))
+
+		object, err := os.Create(filepath.Join(differentShard, "123456"))
+		require.NoError(t, err)
+		testhelper.MustClose(t, object)
+
+		looseObjects, err := estimateLooseObjectCount(repo)
+		require.NoError(t, err)
+		require.Zero(t, looseObjects)
+	})
+
+	t.Run("object in estimation shard", func(t *testing.T) {
+		estimationShard := filepath.Join(repoPath, "objects", "17")
+		require.NoError(t, os.MkdirAll(estimationShard, 0o755))
+
+		object, err := os.Create(filepath.Join(estimationShard, "123456"))
+		require.NoError(t, err)
+		testhelper.MustClose(t, object)
+
+		// A single object in the sampled shard extrapolates to 256 objects.
+		looseObjects, err := estimateLooseObjectCount(repo)
+		require.NoError(t, err)
+		require.Equal(t, int64(256), looseObjects)
+
+		// Create a second object in there.
+		object, err = os.Create(filepath.Join(estimationShard, "654321"))
+		require.NoError(t, err)
+		testhelper.MustClose(t, object)
+
+		// Two sampled objects extrapolate to 2 * 256 loose objects.
+		looseObjects, err = estimateLooseObjectCount(repo)
+		require.NoError(t, err)
+		require.Equal(t, int64(512), looseObjects)
+	})
+}
diff --git a/internal/gitaly/service/repository/repack.go b/internal/gitaly/service/repository/repack.go
index 280fb6186..a3f428650 100644
--- a/internal/gitaly/service/repository/repack.go
+++ b/internal/gitaly/service/repository/repack.go
@@ -36,31 +36,52 @@ func log2Threads(numCPUs int) git.ValueFlag {
 func (s *server) RepackFull(ctx context.Context, in *gitalypb.RepackFullRequest) (*gitalypb.RepackFullResponse, error) {
 	repo := s.localrepo(in.GetRepository())
-	options := []git.Option{
-		git.Flag{Name: "-A"},
-		git.Flag{Name: "--pack-kept-objects"},
-		git.Flag{Name: "-l"},
-		log2Threads(runtime.NumCPU()),
+	cfg := repackCommandConfig{
+		fullRepack:  true,
+		writeBitmap: in.GetCreateBitmap(),
 	}
-	if err := repack(ctx, repo, in.GetCreateBitmap(), options...); err != nil {
+
+	if err := repack(ctx, repo, cfg); err != nil {
 		return nil, helper.ErrInternal(err)
 	}
+
 	return &gitalypb.RepackFullResponse{}, nil
 }
 
 func (s *server) RepackIncremental(ctx context.Context, in *gitalypb.RepackIncrementalRequest) (*gitalypb.RepackIncrementalResponse, error) {
 	repo := s.localrepo(in.GetRepository())
-	if err := repack(ctx, repo, false); err != nil {
-		return nil, helper.ErrInternal(err)
+	cfg := repackCommandConfig{
+		fullRepack:  false,
+		writeBitmap: false,
+	}
+
+	if err := repack(ctx, repo, cfg); err != nil {
+		return nil, err
 	}
+
 	return &gitalypb.RepackIncrementalResponse{}, nil
 }
 
-func repack(ctx context.Context, repo *localrepo.Repo, bitmap bool, args ...git.Option) error {
+// repackCommandConfig controls how git-repack(1) is invoked.
+type repackCommandConfig struct {
+	fullRepack  bool
+	writeBitmap bool
+}
+
+func repack(ctx context.Context, repo *localrepo.Repo, cfg repackCommandConfig) error {
+	var options []git.Option
+	if cfg.fullRepack {
+		options = append(options,
+
git.Flag{Name: "-A"},
+			git.Flag{Name: "--pack-kept-objects"},
+			git.Flag{Name: "-l"},
+			log2Threads(runtime.NumCPU()),
+		)
+	}
+
+	// "-d" is always passed so redundant packs get removed after the
+	// repack; the options above are only appended for full repacks.
 	if err := repo.ExecAndWait(ctx, git.SubCmd{
 		Name: "repack",
-		Flags: append([]git.Option{git.Flag{Name: "-d"}}, args...),
-	}, git.WithConfig(repackConfig(ctx, bitmap)...)); err != nil {
+		Flags: append([]git.Option{git.Flag{Name: "-d"}}, options...),
+	}, git.WithConfig(repackConfig(ctx, cfg.writeBitmap)...)); err != nil {
 		return err
 	}
|