Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorToon Claes <toon@gitlab.com>2022-02-10 12:22:20 +0300
committerToon Claes <toon@gitlab.com>2022-02-10 12:22:20 +0300
commit3074a5673df93a6a765c2bdde84bd4f6f670afcb (patch)
tree000ff6b969004853218c94b6a416134db73ccc26
parentd9e05a74f672aa10f5e3449d4bae433995974471 (diff)
parent9d5181b803f5f4720ded8e070db13d6c1fe73123 (diff)
Merge branch 'pks-optimize-repository-heuristic-optimizations' into 'master'
Extend heuristic optimizations in OptimizeRepository Closes #2296 See merge request gitlab-org/gitaly!4332
-rw-r--r--internal/git/command_description.go3
-rw-r--r--internal/gitaly/service/repository/cleanup.go35
-rw-r--r--internal/gitaly/service/repository/cleanup_test.go29
-rw-r--r--internal/gitaly/service/repository/optimize.go404
-rw-r--r--internal/gitaly/service/repository/optimize_test.go557
-rw-r--r--internal/gitaly/service/repository/repack.go43
6 files changed, 1021 insertions, 50 deletions
diff --git a/internal/git/command_description.go b/internal/git/command_description.go
index d4d5ec82d..a1a33c037 100644
--- a/internal/git/command_description.go
+++ b/internal/git/command_description.go
@@ -176,6 +176,9 @@ var commandDescriptions = map[string]commandDescription{
"pack-objects": {
flags: scNoRefUpdates | scGeneratesPackfiles,
},
+ "prune": {
+ flags: scNoRefUpdates,
+ },
"push": {
flags: scNoRefUpdates,
opts: []GlobalOption{
diff --git a/internal/gitaly/service/repository/cleanup.go b/internal/gitaly/service/repository/cleanup.go
index 3bb71faa0..8755c053d 100644
--- a/internal/gitaly/service/repository/cleanup.go
+++ b/internal/gitaly/service/repository/cleanup.go
@@ -127,6 +127,41 @@ func isExitWithCode(err error, code int) bool {
}
func cleanDisconnectedWorktrees(ctx context.Context, repo *localrepo.Repo) error {
+ repoPath, err := repo.Path()
+ if err != nil {
+ return err
+ }
+
+ // Spawning a command is expensive. We thus try to avoid the overhead by first
+ // determining if there could possibly be any work to be done by git-worktree(1). We do so
+ // by reading the directory in which worktrees are stored, and if it's empty then we know
+ // that there aren't any worktrees in the first place.
+ worktreeEntries, err := os.ReadDir(filepath.Join(repoPath, "worktrees"))
+ if err != nil {
+ if errors.Is(err, os.ErrNotExist) {
+ return nil
+ }
+ }
+
+ hasWorktrees := false
+ for _, worktreeEntry := range worktreeEntries {
+ if !worktreeEntry.IsDir() {
+ continue
+ }
+
+ if worktreeEntry.Name() == "." || worktreeEntry.Name() == ".." {
+ continue
+ }
+
+ hasWorktrees = true
+ break
+ }
+
+ // There are no worktrees, so let's avoid spawning the Git command.
+ if !hasWorktrees {
+ return nil
+ }
+
return repo.ExecAndWait(ctx, git.SubSubCmd{
Name: "worktree",
Action: "prune",
diff --git a/internal/gitaly/service/repository/cleanup_test.go b/internal/gitaly/service/repository/cleanup_test.go
index 215a57b11..9975a22ab 100644
--- a/internal/gitaly/service/repository/cleanup_test.go
+++ b/internal/gitaly/service/repository/cleanup_test.go
@@ -8,8 +8,10 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
+ "gitlab.com/gitlab-org/gitaly/v14/internal/git"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/gittest"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo"
+ "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config"
"gitlab.com/gitlab-org/gitaly/v14/internal/testhelper"
"gitlab.com/gitlab-org/gitaly/v14/internal/testhelper/testcfg"
"gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb"
@@ -141,6 +143,33 @@ func TestCleanupDisconnectedWorktrees(t *testing.T) {
gittest.AddWorktree(t, cfg, repoPath, worktreePath)
}
+func TestCleanupDisconnectedWorktrees_doesNothingWithoutWorktrees(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg, repoProto, repoPath, _ := setupRepositoryService(ctx, t)
+ worktreePath := filepath.Join(testhelper.TempDir(t), "worktree")
+
+ failingGitCmdFactory := gittest.NewInterceptingCommandFactory(ctx, t, cfg, func(git.ExecutionEnvironment) string {
+ return `#!/usr/bin/env bash
+ exit 15
+ `
+ })
+
+ repo := localrepo.New(config.NewLocator(cfg), failingGitCmdFactory, nil, repoProto)
+
+ // If this command did spawn git-worktree(1) we'd see an error. It doesn't though because it
+ // detects that there aren't any worktrees at all.
+ require.NoError(t, cleanDisconnectedWorktrees(ctx, repo))
+
+ gittest.AddWorktree(t, cfg, repoPath, worktreePath)
+
+ // We have now added a worktree, so it should detect that there are worktrees and thus
+ // spawn the Git command. We thus expect the error code we inject via the failing Git
+ // command factory.
+ require.EqualError(t, cleanDisconnectedWorktrees(ctx, repo), "exit status 15")
+}
+
func TestRemoveWorktree(t *testing.T) {
t.Parallel()
diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go
index 4bfbfb386..b13566036 100644
--- a/internal/gitaly/service/repository/optimize.go
+++ b/internal/gitaly/service/repository/optimize.go
@@ -1,10 +1,18 @@
package repository
import (
+ "bytes"
"context"
+ "errors"
"fmt"
+ "io/fs"
+ "math"
"os"
+ "path/filepath"
+ "strings"
+ "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
+ "gitlab.com/gitlab-org/gitaly/v14/internal/git"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/housekeeping"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/stats"
@@ -12,86 +20,404 @@ import (
"gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb"
)
+const (
+ // looseObjectLimit is the limit of loose objects we accept both when doing incremental
+ // repacks and when pruning objects.
+ looseObjectLimit = 1024
+)
+
+func (s *server) OptimizeRepository(ctx context.Context, in *gitalypb.OptimizeRepositoryRequest) (*gitalypb.OptimizeRepositoryResponse, error) {
+ if err := s.validateOptimizeRepositoryRequest(in); err != nil {
+ return nil, err
+ }
+
+ repo := s.localrepo(in.GetRepository())
+
+ if err := s.optimizeRepository(ctx, repo); err != nil {
+ return nil, helper.ErrInternal(err)
+ }
+
+ return &gitalypb.OptimizeRepositoryResponse{}, nil
+}
+
+func (s *server) validateOptimizeRepositoryRequest(in *gitalypb.OptimizeRepositoryRequest) error {
+ if in.GetRepository() == nil {
+ return helper.ErrInvalidArgumentf("empty repository")
+ }
+
+ _, err := s.locator.GetRepoPath(in.GetRepository())
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) error {
+ optimizations := struct {
+ PackedObjects bool `json:"packed_objects"`
+ PrunedObjects bool `json:"pruned_objects"`
+ PackedRefs bool `json:"packed_refs"`
+ }{}
+ defer func() {
+ ctxlogrus.Extract(ctx).WithField("optimizations", optimizations).Info("optimized repository")
+ }()
+
+ if err := housekeeping.Perform(ctx, repo, s.txManager); err != nil {
+ return fmt.Errorf("could not execute houskeeping: %w", err)
+ }
+
+ if err := cleanupWorktrees(ctx, repo); err != nil {
+ return fmt.Errorf("could not clean up worktrees: %w", err)
+ }
+
+ didRepack, err := repackIfNeeded(ctx, repo)
+ if err != nil {
+ return fmt.Errorf("could not repack: %w", err)
+ }
+ optimizations.PackedObjects = didRepack
+
+ didPrune, err := pruneIfNeeded(ctx, repo)
+ if err != nil {
+ return fmt.Errorf("could not prune: %w", err)
+ }
+ optimizations.PrunedObjects = didPrune
+
+ didPackRefs, err := packRefsIfNeeded(ctx, repo)
+ if err != nil {
+ return fmt.Errorf("could not pack refs: %w", err)
+ }
+ optimizations.PackedRefs = didPackRefs
+
+ return nil
+}
+
// repackIfNeeded uses a set of heuristics to determine whether the repository needs a
// full repack and, if so, repacks it.
-func (s *server) repackIfNeeded(ctx context.Context, repo *localrepo.Repo, repoProto *gitalypb.Repository) error {
+func repackIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) {
+ repackNeeded, cfg, err := needsRepacking(repo)
+ if err != nil {
+ return false, fmt.Errorf("determining whether repo needs repack: %w", err)
+ }
+
+ if !repackNeeded {
+ return false, nil
+ }
+
+ if err := repack(ctx, repo, cfg); err != nil {
+ return false, err
+ }
+
+ return true, nil
+}
+
+func needsRepacking(repo *localrepo.Repo) (bool, repackCommandConfig, error) {
repoPath, err := repo.Path()
if err != nil {
- return err
+ return false, repackCommandConfig{}, fmt.Errorf("getting repository path: %w", err)
+ }
+
+ altFile, err := repo.InfoAlternatesPath()
+ if err != nil {
+ return false, repackCommandConfig{}, helper.ErrInternal(err)
+ }
+
+ hasAlternate := true
+ if _, err := os.Stat(altFile); os.IsNotExist(err) {
+ hasAlternate = false
}
hasBitmap, err := stats.HasBitmap(repoPath)
if err != nil {
- return helper.ErrInternal(err)
+ return false, repackCommandConfig{}, fmt.Errorf("checking for bitmap: %w", err)
+ }
+
+ // Bitmaps are used to efficiently determine transitive reachability of objects from a
+ // set of commits. They are an essential part of the puzzle required to serve fetches
+ // efficiently, as we'd otherwise need to traverse the object graph every time to find
+ // which objects we have to send. We thus repack the repository with bitmaps enabled in
+ // case they're missing.
+ //
+ // There is one exception: repositories which are connected to an object pool must not have
+ // a bitmap on their own. We do not yet use multi-pack indices, and in that case Git can
+ // only use one bitmap. We already generate this bitmap in the pool, so members of it
+ // shouldn't have another bitmap on their own.
+ if !hasBitmap && !hasAlternate {
+ return true, repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ }, nil
}
missingBloomFilters, err := stats.IsMissingBloomFilters(repoPath)
if err != nil {
- return helper.ErrInternal(err)
+ return false, repackCommandConfig{}, fmt.Errorf("checking for bloom filters: %w", err)
}
- if hasBitmap && !missingBloomFilters {
- return nil
+ // Bloom filters are part of the commit-graph and allow us to efficiently determine which
+ // paths have been modified in a given commit without having to look into the object
+ // database. In the past we didn't compute bloom filters at all, so we want to rewrite the
+ // whole commit-graph to generate them.
+ //
+ // Note that we'll eventually want to move out commit-graph generation from repacking. When
+ // that happens we should update the commit-graph either if it's missing, when bloom filters
+ // are missing or when packfiles have been updated.
+ if missingBloomFilters {
+ return true, repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: !hasAlternate,
+ }, nil
}
- altFile, err := repo.InfoAlternatesPath()
+ largestPackfileSize, packfileCount, err := packfileSizeAndCount(repo)
if err != nil {
- return helper.ErrInternal(err)
+ return false, repackCommandConfig{}, fmt.Errorf("checking largest packfile size: %w", err)
}
- // Repositories with alternates should never have a bitmap, as Git will otherwise complain about
- // multiple bitmaps being present in parent and alternate repository.
- // In case of an error it still tries it is best to optimise the repository.
- createBitMap := false
- if _, err := os.Stat(altFile); os.IsNotExist(err) {
- createBitMap = true
+ // Whenever we do an incremental repack we create a new packfile, and as a result Git may
+ // have to look into every one of the packfiles to find objects. This is less efficient the
+ // more packfiles we have, but we cannot repack the whole repository every time either given
+ // that this may take a lot of time.
+ //
+ // Instead, we determine whether the repository has "too many" packfiles. "Too many" is
+ // relative though: for small repositories it's fine to do full repacks regularly, but for
+ // large repositories we need to be more careful. We thus use a heuristic of "repository
+ // largeness": we take the biggest packfile that exists, and then the maximum allowed number
+ // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)`. This gives the following
+ // allowed number of packfiles:
+ //
+ // - No packfile: 5 packfiles. This is a special case.
+ // - 10MB packfile: 8 packfiles.
+ // - 100MB packfile: 17 packfiles.
+ // - 500MB packfile: 23 packfiles.
+ // - 1GB packfile: 26 packfiles.
+ // - 5GB packfile: 32 packfiles.
+ // - 10GB packfile: 35 packfiles.
+ // - 100GB packfile: 43 packfiles.
+ //
+ // The goal is to have a comparatively quick ramp-up of allowed packfiles as the repository
+ // size grows, but then slow down such that we're effectively capped and don't end up with
+ // an excessive amount of packfiles.
+ //
+ // This is a heuristic and thus imperfect by necessity. We may tune it as we gain experience
+ // with the way it behaves.
+ if int64(math.Max(5, math.Log(float64(largestPackfileSize))/math.Log(1.3))) < packfileCount {
+ return true, repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: !hasAlternate,
+ }, nil
}
- if _, err = s.RepackFull(ctx, &gitalypb.RepackFullRequest{
- Repository: repoProto,
- CreateBitmap: createBitMap,
- }); err != nil {
- return err
+ looseObjectCount, err := estimateLooseObjectCount(repo)
+ if err != nil {
+ return false, repackCommandConfig{}, fmt.Errorf("estimating loose object count: %w", err)
}
- return nil
+ // Most Git commands do not write packfiles directly, but instead write loose objects into
+ // the object database. So while we now know that there ain't too many packfiles, we still
+ // need to check whether we have too many objects.
+ //
+ // In this case it doesn't make a lot of sense to scale incremental repacks with the repo's
+ // size: we only pack loose objects, so the time to pack them doesn't scale with repository
+ // size but with the number of loose objects we have. git-gc(1) uses a threshold of 6700
+ // loose objects to start an incremental repack, but one needs to keep in mind that Git
+ // typically has defaults which are better suited for the client-side instead of the
+ // server-side in most commands.
+ //
+ // In our case we typically want to ensure that our repositories are much better packed than
+ // it is necessary on the client side. We thus take a much stricter limit of 1024 objects.
+ if looseObjectCount > looseObjectLimit {
+ return true, repackCommandConfig{
+ fullRepack: false,
+ writeBitmap: false,
+ }, nil
+ }
+
+ return false, repackCommandConfig{}, nil
}
-func (s *server) optimizeRepository(ctx context.Context, repository *gitalypb.Repository) error {
- repo := s.localrepo(repository)
+func packfileSizeAndCount(repo *localrepo.Repo) (int64, int64, error) {
+ repoPath, err := repo.Path()
+ if err != nil {
+ return 0, 0, fmt.Errorf("getting repository path: %w", err)
+ }
- if err := s.repackIfNeeded(ctx, repo, repository); err != nil {
- return fmt.Errorf("could not repack: %w", err)
+ entries, err := os.ReadDir(filepath.Join(repoPath, "objects/pack"))
+ if err != nil {
+ if errors.Is(err, os.ErrNotExist) {
+ return 0, 0, nil
+ }
+
+ return 0, 0, err
}
- if err := housekeeping.Perform(ctx, repo, s.txManager); err != nil {
- return fmt.Errorf("could not execute houskeeping: %w", err)
+ largestSize := int64(0)
+ count := int64(0)
+
+ for _, entry := range entries {
+ if !strings.HasSuffix(entry.Name(), ".pack") {
+ continue
+ }
+
+ entryInfo, err := entry.Info()
+ if err != nil {
+ if errors.Is(err, os.ErrNotExist) {
+ continue
+ }
+
+ return 0, 0, fmt.Errorf("getting packfile info: %w", err)
+ }
+
+ if entryInfo.Size() > largestSize {
+ largestSize = entryInfo.Size()
+ }
+
+ count++
}
- return nil
+ return largestSize / 1024 / 1024, count, nil
}
-func (s *server) OptimizeRepository(ctx context.Context, in *gitalypb.OptimizeRepositoryRequest) (*gitalypb.OptimizeRepositoryResponse, error) {
- if err := s.validateOptimizeRepositoryRequest(in); err != nil {
- return nil, err
+// estimateLooseObjectCount estimates the number of loose objects in the repository. Due to the
+// object name being derived via a cryptographic hash we know that in the general case, objects are
+// evenly distributed across their sharding directories. We can thus estimate the number of loose
+// objects by opening a single sharding directory and counting its entries.
+func estimateLooseObjectCount(repo *localrepo.Repo) (int64, error) {
+ repoPath, err := repo.Path()
+ if err != nil {
+ return 0, fmt.Errorf("getting repository path: %w", err)
}
- if err := s.optimizeRepository(ctx, in.GetRepository()); err != nil {
- return nil, helper.ErrInternal(err)
+ // We use the same sharding directory as git-gc(1) does for its estimation.
+ entries, err := os.ReadDir(filepath.Join(repoPath, "objects/17"))
+ if err != nil {
+ if errors.Is(err, os.ErrNotExist) {
+ return 0, nil
+ }
+
+ return 0, fmt.Errorf("reading loose object shard: %w", err)
}
- return &gitalypb.OptimizeRepositoryResponse{}, nil
+ looseObjects := int64(0)
+ for _, entry := range entries {
+ if strings.LastIndexAny(entry.Name(), "0123456789abcdef") != len(entry.Name())-1 {
+ continue
+ }
+
+ looseObjects++
+ }
+
+ // Scale up found loose objects by the number of sharding directories.
+ return looseObjects * 256, nil
}
-func (s *server) validateOptimizeRepositoryRequest(in *gitalypb.OptimizeRepositoryRequest) error {
- if in.GetRepository() == nil {
- return helper.ErrInvalidArgumentf("empty repository")
+// pruneIfNeeded removes objects from the repository which are either unreachable or which are
+// already part of a packfile. We use a grace period of two weeks.
+func pruneIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) {
+ // Pool repositories must never prune any objects, or otherwise we may corrupt members of
+ // that pool if they still refer to that object.
+ if strings.HasPrefix(repo.GetRelativePath(), "@pools") {
+ return false, nil
}
- _, err := s.locator.GetRepoPath(in.GetRepository())
+ looseObjectCount, err := estimateLooseObjectCount(repo)
if err != nil {
- return err
+ return false, fmt.Errorf("estimating loose object count: %w", err)
}
- return nil
+ // We again use the same limit here as we do when doing an incremental repack. This is done
+ // intentionally: if we determine that there's too many loose objects and try to repack, but
+ // all of those loose objects are in fact unreachable, then we'd still have the same number
+ // of unreachable objects after the incremental repack. We'd thus try to repack every single
+ // time.
+ //
+ // Using the same limit here doesn't quite fix this case: the unreachable objects would only
+ // be pruned after a grace period of two weeks. But at least we know that we will eventually
+ // prune those unreachable objects, at which point we won't try to do another incremental
+ // repack.
+ if looseObjectCount <= looseObjectLimit {
+ return false, nil
+ }
+
+ if err := repo.ExecAndWait(ctx, git.SubCmd{
+ Name: "prune",
+ }); err != nil {
+ return false, fmt.Errorf("pruning objects: %w", err)
+ }
+
+ return true, nil
+}
+
+func packRefsIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) {
+ repoPath, err := repo.Path()
+ if err != nil {
+ return false, fmt.Errorf("getting repository path: %w", err)
+ }
+ refsPath := filepath.Join(repoPath, "refs")
+
+ looseRefs := int64(0)
+ if err := filepath.WalkDir(refsPath, func(path string, entry fs.DirEntry, err error) error {
+ if err != nil {
+ return err
+ }
+
+ if !entry.IsDir() {
+ looseRefs++
+ }
+
+ return nil
+ }); err != nil {
+ return false, fmt.Errorf("counting loose refs: %w", err)
+ }
+
+ // If there aren't any loose refs then there is nothing we need to do.
+ if looseRefs == 0 {
+ return false, nil
+ }
+
+ packedRefsSize := int64(0)
+ if stat, err := os.Stat(filepath.Join(repoPath, "packed-refs")); err != nil {
+ if !errors.Is(err, os.ErrNotExist) {
+ return false, fmt.Errorf("getting packed-refs size: %w", err)
+ }
+ } else {
+ packedRefsSize = stat.Size()
+ }
+
+ // Packing loose references into the packed-refs file scales with the number of references
+ // we're about to write. We thus decide whether we repack refs by weighing the current size
+ // of the packed-refs file against the number of loose references. This is done such that we
+ // do not repack too often on repositories with a huge number of references, where we can
+ // expect a lot of churn in the number of references.
+ //
+ // As a heuristic, we repack if the number of loose references in the repository exceeds
+ // `log(packed_refs_size_in_bytes/100)/log(1.15)`, which scales as following (number of refs
+ // is estimated with 100 bytes per reference):
+ //
+ // - 1kB ~ 10 packed refs: 16 refs
+ // - 10kB ~ 100 packed refs: 33 refs
+ // - 100kB ~ 1k packed refs: 49 refs
+ // - 1MB ~ 10k packed refs: 66 refs
+ // - 10MB ~ 100k packed refs: 82 refs
+ // - 100MB ~ 1m packed refs: 99 refs
+ //
+ // We thus allow roughly 16 additional loose refs per factor of ten of packed refs.
+ //
+ // This heuristic may likely need tweaking in the future, but should serve as a good first
+ // iteration.
+ if int64(math.Max(16, math.Log(float64(packedRefsSize)/100)/math.Log(1.15))) > looseRefs {
+ return false, nil
+ }
+
+ var stderr bytes.Buffer
+ if err := repo.ExecAndWait(ctx, git.SubCmd{
+ Name: "pack-refs",
+ Flags: []git.Option{
+ git.Flag{Name: "--all"},
+ },
+ }, git.WithStderr(&stderr)); err != nil {
+ return false, fmt.Errorf("packing refs: %w, stderr: %q", err, stderr.String())
+ }
+
+ return true, nil
}
diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go
index c35ffa924..f7e4152e0 100644
--- a/internal/gitaly/service/repository/optimize_test.go
+++ b/internal/gitaly/service/repository/optimize_test.go
@@ -2,6 +2,8 @@ package repository
import (
"bytes"
+ "fmt"
+ "io"
"os"
"path/filepath"
"testing"
@@ -10,6 +12,7 @@ import (
"github.com/stretchr/testify/require"
"gitlab.com/gitlab-org/gitaly/v14/internal/git"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/gittest"
+ "gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo"
"gitlab.com/gitlab-org/gitaly/v14/internal/git/stats"
"gitlab.com/gitlab-org/gitaly/v14/internal/testhelper"
"gitlab.com/gitlab-org/gitaly/v14/proto/go/gitalypb"
@@ -186,3 +189,557 @@ func TestOptimizeRepositoryValidation(t *testing.T) {
_, err := client.OptimizeRepository(ctx, &gitalypb.OptimizeRepositoryRequest{Repository: repo})
require.NoError(t, err)
}
+
+type infiniteReader struct{}
+
+func (r infiniteReader) Read(b []byte) (int, error) {
+ for i := range b {
+ b[i] = '\000'
+ }
+ return len(b), nil
+}
+
+func TestNeedsRepacking(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg, _ := setupRepositoryServiceWithoutRepo(t)
+
+ for _, tc := range []struct {
+ desc string
+ setup func(t *testing.T) *gitalypb.Repository
+ expectedErr error
+ expectedNeeded bool
+ expectedConfig repackCommandConfig
+ }{
+ {
+ desc: "empty repo",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, _ := gittest.CreateRepository(ctx, t, cfg)
+ return repoProto
+ },
+ // This is a bug: if the repo is empty then we wouldn't ever generate a
+ // packfile, but we determine a repack is needed because it's missing a
+ // bitmap. It's a rather benign bug though: git-repack(1) will exit
+ // immediately because it knows that there's nothing to repack.
+ expectedNeeded: true,
+ expectedConfig: repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ },
+ },
+ {
+ desc: "missing bitmap",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, _ := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ Seed: gittest.SeedGitLabTest,
+ })
+ return repoProto
+ },
+ expectedNeeded: true,
+ expectedConfig: repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ },
+ },
+ {
+ desc: "missing bitmap with alternate",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ Seed: gittest.SeedGitLabTest,
+ })
+
+ // Create the alternates file. If it exists, then we shouldn't try
+ // to generate a bitmap.
+ require.NoError(t, os.WriteFile(filepath.Join(repoPath, "objects", "info", "alternates"), nil, 0o755))
+
+ return repoProto
+ },
+ expectedNeeded: true,
+ expectedConfig: repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: false,
+ },
+ },
+ {
+ desc: "missing commit-graph",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ Seed: gittest.SeedGitLabTest,
+ })
+
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index")
+
+ return repoProto
+ },
+ expectedNeeded: true,
+ expectedConfig: repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ },
+ },
+ {
+ desc: "commit-graph without bloom filters",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ Seed: gittest.SeedGitLabTest,
+ })
+
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index")
+ gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write")
+
+ return repoProto
+ },
+ expectedNeeded: true,
+ expectedConfig: repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ },
+ },
+ {
+ desc: "no repack needed",
+ setup: func(t *testing.T) *gitalypb.Repository {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ Seed: gittest.SeedGitLabTest,
+ })
+
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad", "--write-bitmap-index")
+ gittest.Exec(t, cfg, "-C", repoPath, "commit-graph", "write", "--changed-paths", "--split")
+
+ return repoProto
+ },
+ expectedNeeded: false,
+ },
+ } {
+ tc := tc
+ t.Run(tc.desc, func(t *testing.T) {
+ t.Parallel()
+
+ repoProto := tc.setup(t)
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ repackNeeded, repackCfg, err := needsRepacking(repo)
+ require.Equal(t, tc.expectedErr, err)
+ require.Equal(t, tc.expectedNeeded, repackNeeded)
+ require.Equal(t, tc.expectedConfig, repackCfg)
+ })
+ }
+
+ const megaByte = 1024 * 1024
+
+ for _, tc := range []struct {
+ packfileSize int64
+ requiredPackfiles int
+ }{
+ {
+ packfileSize: 1,
+ requiredPackfiles: 5,
+ },
+ {
+ packfileSize: 5 * megaByte,
+ requiredPackfiles: 6,
+ },
+ {
+ packfileSize: 10 * megaByte,
+ requiredPackfiles: 8,
+ },
+ {
+ packfileSize: 50 * megaByte,
+ requiredPackfiles: 14,
+ },
+ {
+ packfileSize: 100 * megaByte,
+ requiredPackfiles: 17,
+ },
+ {
+ packfileSize: 500 * megaByte,
+ requiredPackfiles: 23,
+ },
+ {
+ packfileSize: 1000 * megaByte,
+ requiredPackfiles: 26,
+ },
+ // Let's not go any further than this, we're thrashing the temporary directory.
+ } {
+ t.Run(fmt.Sprintf("packfile with %d bytes", tc.packfileSize), func(t *testing.T) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+ packDir := filepath.Join(repoPath, "objects", "pack")
+
+ // Emulate the existence of a bitmap and a commit-graph with bloom filters.
+ // We explicitly don't want to generate them via Git commands as they would
+ // require us to already have objects in the repository, and we want to be
+ // in full control over all objects and packfiles in the repo.
+ require.NoError(t, os.WriteFile(filepath.Join(packDir, "something.bitmap"), nil, 0o644))
+ commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath)
+ require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755))
+ require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644))
+
+ // We first create a single big packfile which is used to determine the
+ // boundary of when we repack.
+ bigPackfile, err := os.OpenFile(filepath.Join(packDir, "big.pack"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644)
+ require.NoError(t, err)
+ defer testhelper.MustClose(t, bigPackfile)
+ _, err = io.Copy(bigPackfile, io.LimitReader(infiniteReader{}, tc.packfileSize))
+ require.NoError(t, err)
+
+ // And then we create one less packfile than we need to hit the boundary.
+ // This is done to assert that we indeed don't repack before hitting the
+ // boundary.
+ for i := 0; i < tc.requiredPackfiles-1; i++ {
+ additionalPackfile, err := os.Create(filepath.Join(packDir, fmt.Sprintf("%d.pack", i)))
+ require.NoError(t, err)
+ testhelper.MustClose(t, additionalPackfile)
+ }
+
+ repackNeeded, _, err := needsRepacking(repo)
+ require.NoError(t, err)
+ require.False(t, repackNeeded)
+
+ // Now we create the additional packfile that causes us to hit the boundary.
+ // We should thus see that we want to repack now.
+ lastPackfile, err := os.Create(filepath.Join(packDir, "last.pack"))
+ require.NoError(t, err)
+ testhelper.MustClose(t, lastPackfile)
+
+ repackNeeded, repackCfg, err := needsRepacking(repo)
+ require.NoError(t, err)
+ require.True(t, repackNeeded)
+ require.Equal(t, repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: true,
+ }, repackCfg)
+ })
+ }
+
+ for _, tc := range []struct {
+ desc string
+ looseObjects []string
+ expectedRepack bool
+ }{
+ {
+ desc: "no objects",
+ looseObjects: nil,
+ expectedRepack: false,
+ },
+ {
+ desc: "object not in 17 shard",
+ looseObjects: []string{
+ filepath.Join("ab/12345"),
+ },
+ expectedRepack: false,
+ },
+ {
+ desc: "object in 17 shard",
+ looseObjects: []string{
+ filepath.Join("17/12345"),
+ },
+ expectedRepack: false,
+ },
+ {
+ desc: "objects in different shards",
+ looseObjects: []string{
+ filepath.Join("ab/12345"),
+ filepath.Join("cd/12345"),
+ filepath.Join("12/12345"),
+ filepath.Join("17/12345"),
+ },
+ expectedRepack: false,
+ },
+ {
+ desc: "boundary",
+ looseObjects: []string{
+ filepath.Join("17/1"),
+ filepath.Join("17/2"),
+ filepath.Join("17/3"),
+ filepath.Join("17/4"),
+ },
+ expectedRepack: false,
+ },
+ {
+ desc: "exceeding boundary should cause repack",
+ looseObjects: []string{
+ filepath.Join("17/1"),
+ filepath.Join("17/2"),
+ filepath.Join("17/3"),
+ filepath.Join("17/4"),
+ filepath.Join("17/5"),
+ },
+ expectedRepack: true,
+ },
+ } {
+ t.Run(tc.desc, func(t *testing.T) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ // Emulate the existence of a bitmap and a commit-graph with bloom filters.
+ // We explicitly don't want to generate them via Git commands as they would
+ // require us to already have objects in the repository, and we want to be
+ // in full control over all objects and packfiles in the repo.
+ require.NoError(t, os.WriteFile(filepath.Join(repoPath, "objects", "pack", "something.bitmap"), nil, 0o644))
+ commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath)
+ require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755))
+ require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644))
+
+ for _, looseObjectPath := range tc.looseObjects {
+ looseObjectPath := filepath.Join(repoPath, "objects", looseObjectPath)
+ require.NoError(t, os.MkdirAll(filepath.Dir(looseObjectPath), 0o755))
+
+ looseObjectFile, err := os.Create(looseObjectPath)
+ require.NoError(t, err)
+ testhelper.MustClose(t, looseObjectFile)
+ }
+
+ repackNeeded, repackCfg, err := needsRepacking(repo)
+ require.NoError(t, err)
+ require.Equal(t, tc.expectedRepack, repackNeeded)
+ if tc.expectedRepack {
+ require.Equal(t, repackCommandConfig{
+ fullRepack: false,
+ writeBitmap: false,
+ }, repackCfg)
+ }
+ })
+ }
+}
+
+func TestPruneIfNeeded(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg, _ := setupRepositoryServiceWithoutRepo(t)
+
+ for _, tc := range []struct {
+ desc string
+ isPool bool
+ looseObjects []string
+ expectedPrune bool
+ }{
+ {
+ desc: "no objects",
+ looseObjects: nil,
+ expectedPrune: false,
+ },
+ {
+ desc: "object not in 17 shard",
+ looseObjects: []string{
+ filepath.Join("ab/12345"),
+ },
+ expectedPrune: false,
+ },
+ {
+ desc: "object in 17 shard",
+ looseObjects: []string{
+ filepath.Join("17/12345"),
+ },
+ expectedPrune: false,
+ },
+ {
+ desc: "objects in different shards",
+ looseObjects: []string{
+ filepath.Join("ab/12345"),
+ filepath.Join("cd/12345"),
+ filepath.Join("12/12345"),
+ filepath.Join("17/12345"),
+ },
+ expectedPrune: false,
+ },
+ {
+ desc: "boundary",
+ looseObjects: []string{
+ filepath.Join("17/1"),
+ filepath.Join("17/2"),
+ filepath.Join("17/3"),
+ filepath.Join("17/4"),
+ },
+ expectedPrune: false,
+ },
+ {
+ desc: "exceeding boundary should cause prune",
+ looseObjects: []string{
+ filepath.Join("17/1"),
+ filepath.Join("17/2"),
+ filepath.Join("17/3"),
+ filepath.Join("17/4"),
+ filepath.Join("17/5"),
+ },
+ expectedPrune: true,
+ },
+ {
+ desc: "exceeding boundary on pool",
+ isPool: true,
+ looseObjects: []string{
+ filepath.Join("17/1"),
+ filepath.Join("17/2"),
+ filepath.Join("17/3"),
+ filepath.Join("17/4"),
+ filepath.Join("17/5"),
+ },
+ expectedPrune: false,
+ },
+ } {
+ t.Run(tc.desc, func(t *testing.T) {
+ relativePath := gittest.NewRepositoryName(t, true)
+ if tc.isPool {
+ relativePath = gittest.NewObjectPoolName(t)
+ }
+
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ RelativePath: relativePath,
+ })
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ for _, looseObjectPath := range tc.looseObjects {
+ looseObjectPath := filepath.Join(repoPath, "objects", looseObjectPath)
+ require.NoError(t, os.MkdirAll(filepath.Dir(looseObjectPath), 0o755))
+
+ looseObjectFile, err := os.Create(looseObjectPath)
+ require.NoError(t, err)
+ testhelper.MustClose(t, looseObjectFile)
+ }
+
+ didPrune, err := pruneIfNeeded(ctx, repo)
+ require.Equal(t, tc.expectedPrune, didPrune)
+ require.NoError(t, err)
+ })
+ }
+}
+
+func TestPackRefsIfNeeded(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg, _ := setupRepositoryServiceWithoutRepo(t)
+
+ const kiloByte = 1024
+
+ for _, tc := range []struct {
+ packedRefsSize int64
+ requiredRefs int
+ }{
+ {
+ packedRefsSize: 1,
+ requiredRefs: 16,
+ },
+ {
+ packedRefsSize: 1 * kiloByte,
+ requiredRefs: 16,
+ },
+ {
+ packedRefsSize: 10 * kiloByte,
+ requiredRefs: 33,
+ },
+ {
+ packedRefsSize: 100 * kiloByte,
+ requiredRefs: 49,
+ },
+ {
+ packedRefsSize: 1000 * kiloByte,
+ requiredRefs: 66,
+ },
+ {
+ packedRefsSize: 10000 * kiloByte,
+ requiredRefs: 82,
+ },
+ {
+ packedRefsSize: 100000 * kiloByte,
+ requiredRefs: 99,
+ },
+ } {
+ t.Run(fmt.Sprintf("packed-refs with %d bytes", tc.packedRefsSize), func(t *testing.T) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ // Write an empty commit such that we can create valid refs.
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents())
+ looseRefContent := []byte(commitID.String() + "\n")
+
+ // We first create a single big packfile which is used to determine the
+ // boundary of when we repack. We need to write a valid packed-refs file or
+ // otherwise git-pack-refs(1) would choke later on, so we just write the
+ // file such that every line is a separate ref of exactly 128 bytes in
+ // length (a divisor of 1024), referring to the commit we created above.
+ packedRefs, err := os.OpenFile(filepath.Join(repoPath, "packed-refs"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644)
+ require.NoError(t, err)
+ defer testhelper.MustClose(t, packedRefs)
+ for i := int64(0); i < tc.packedRefsSize/128; i++ {
+ packedRefLine := fmt.Sprintf("%s refs/something/this-line-is-padded-to-exactly-128-bytes-%030d\n", commitID.String(), i)
+ require.Len(t, packedRefLine, 128)
+ _, err := packedRefs.WriteString(packedRefLine)
+ require.NoError(t, err)
+ }
+ require.NoError(t, packedRefs.Sync())
+
+ // And then we create one less loose ref than we need to hit the boundary.
+ // This is done to assert that we indeed don't repack before hitting the
+ // boundary.
+ for i := 0; i < tc.requiredRefs-1; i++ {
+ looseRefPath := filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("branch-%d", i))
+ require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644))
+ }
+
+ didRepack, err := packRefsIfNeeded(ctx, repo)
+ require.NoError(t, err)
+ require.False(t, didRepack)
+
+ // Now we create the additional loose ref that causes us to hit the
+ // boundary. We should thus see that we want to repack now.
+ looseRefPath := filepath.Join(repoPath, "refs", "heads", "last-branch")
+ require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644))
+
+ didRepack, err = packRefsIfNeeded(ctx, repo)
+ require.NoError(t, err)
+ require.True(t, didRepack)
+ })
+ }
+}
+
+func TestEstimateLooseObjectCount(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg, _ := setupRepositoryServiceWithoutRepo(t)
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg)
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ t.Run("empty repository", func(t *testing.T) {
+ looseObjects, err := estimateLooseObjectCount(repo)
+ require.NoError(t, err)
+ require.Zero(t, looseObjects)
+ })
+
+ t.Run("object in different shard", func(t *testing.T) {
+ differentShard := filepath.Join(repoPath, "objects", "a0")
+ require.NoError(t, os.MkdirAll(differentShard, 0o755))
+
+ object, err := os.Create(filepath.Join(differentShard, "123456"))
+ require.NoError(t, err)
+ testhelper.MustClose(t, object)
+
+ looseObjects, err := estimateLooseObjectCount(repo)
+ require.NoError(t, err)
+ require.Zero(t, looseObjects)
+ })
+
+ t.Run("object in estimation shard", func(t *testing.T) {
+ estimationShard := filepath.Join(repoPath, "objects", "17")
+ require.NoError(t, os.MkdirAll(estimationShard, 0o755))
+
+ object, err := os.Create(filepath.Join(estimationShard, "123456"))
+ require.NoError(t, err)
+ testhelper.MustClose(t, object)
+
+ looseObjects, err := estimateLooseObjectCount(repo)
+ require.NoError(t, err)
+ require.Equal(t, int64(256), looseObjects)
+
+ // Create a second object in there.
+ object, err = os.Create(filepath.Join(estimationShard, "654321"))
+ require.NoError(t, err)
+ testhelper.MustClose(t, object)
+
+ looseObjects, err = estimateLooseObjectCount(repo)
+ require.NoError(t, err)
+ require.Equal(t, int64(512), looseObjects)
+ })
+}
diff --git a/internal/gitaly/service/repository/repack.go b/internal/gitaly/service/repository/repack.go
index 280fb6186..a3f428650 100644
--- a/internal/gitaly/service/repository/repack.go
+++ b/internal/gitaly/service/repository/repack.go
@@ -36,31 +36,52 @@ func log2Threads(numCPUs int) git.ValueFlag {
func (s *server) RepackFull(ctx context.Context, in *gitalypb.RepackFullRequest) (*gitalypb.RepackFullResponse, error) {
repo := s.localrepo(in.GetRepository())
- options := []git.Option{
- git.Flag{Name: "-A"},
- git.Flag{Name: "--pack-kept-objects"},
- git.Flag{Name: "-l"},
- log2Threads(runtime.NumCPU()),
+ cfg := repackCommandConfig{
+ fullRepack: true,
+ writeBitmap: in.GetCreateBitmap(),
}
- if err := repack(ctx, repo, in.GetCreateBitmap(), options...); err != nil {
+
+ if err := repack(ctx, repo, cfg); err != nil {
return nil, helper.ErrInternal(err)
}
+
return &gitalypb.RepackFullResponse{}, nil
}
func (s *server) RepackIncremental(ctx context.Context, in *gitalypb.RepackIncrementalRequest) (*gitalypb.RepackIncrementalResponse, error) {
repo := s.localrepo(in.GetRepository())
- if err := repack(ctx, repo, false); err != nil {
- return nil, helper.ErrInternal(err)
+ cfg := repackCommandConfig{
+ fullRepack: false,
+ writeBitmap: false,
+ }
+
+ if err := repack(ctx, repo, cfg); err != nil {
+ return nil, err
}
+
return &gitalypb.RepackIncrementalResponse{}, nil
}
-func repack(ctx context.Context, repo *localrepo.Repo, bitmap bool, args ...git.Option) error {
+type repackCommandConfig struct {
+ fullRepack bool
+ writeBitmap bool
+}
+
+func repack(ctx context.Context, repo *localrepo.Repo, cfg repackCommandConfig) error {
+ var options []git.Option
+ if cfg.fullRepack {
+ options = append(options,
+ git.Flag{Name: "-A"},
+ git.Flag{Name: "--pack-kept-objects"},
+ git.Flag{Name: "-l"},
+ log2Threads(runtime.NumCPU()),
+ )
+ }
+
if err := repo.ExecAndWait(ctx, git.SubCmd{
Name: "repack",
- Flags: append([]git.Option{git.Flag{Name: "-d"}}, args...),
- }, git.WithConfig(repackConfig(ctx, bitmap)...)); err != nil {
+ Flags: append([]git.Option{git.Flag{Name: "-d"}}, options...),
+ }, git.WithConfig(repackConfig(ctx, cfg.writeBitmap)...)); err != nil {
return err
}