diff options
author | Will Chandler <wchandler@gitlab.com> | 2022-06-22 18:43:09 +0300 |
---|---|---|
committer | Will Chandler <wchandler@gitlab.com> | 2022-06-22 18:43:09 +0300 |
commit | b265cf6ef8e14e8884b50cdaed9628515cb15639 (patch) | |
tree | 6b5790647c62912772eba55e4ae0356b6c3a5e84 | |
parent | 223039568deaccf47f5509900d19e866bde29bf5 (diff) | |
parent | c9c426cf6297984a2639f4036f34fc725fa3a8b2 (diff) |
Merge branch 'pks-repo-size-fix-alternate-refs-perf-regression' into 'master'
localrepo: Speed up calculating size for repo with excluded alternates
See merge request gitlab-org/gitaly!4657
-rw-r--r-- | internal/git/command_description.go | 10 | ||||
-rw-r--r-- | internal/git/localrepo/remote.go | 8 | ||||
-rw-r--r-- | internal/git/localrepo/repo.go | 45 | ||||
-rw-r--r-- | internal/git/localrepo/repo_test.go | 418 |
4 files changed, 309 insertions, 172 deletions
diff --git a/internal/git/command_description.go b/internal/git/command_description.go index 6ca1ce76c..c0939a704 100644 --- a/internal/git/command_description.go +++ b/internal/git/command_description.go @@ -86,6 +86,16 @@ var commandDescriptions = map[string]commandDescription{ flags: 0, opts: append([]GlobalOption{ + // We've observed performance issues when fetching into big repositories + // part of an object pool. The root cause of this seems to be the + // connectivity check, which by default will also include references of any + // alternates. Given that object pools often have hundreds of thousands of + // references, this is quite expensive to compute. Below config entry will + // disable listing of alternate refs: they shouldn't even be included in the + // negotiation phase, so they aren't going to matter in the connectivity + // check either. + ConfigPair{Key: "core.alternateRefsCommand", Value: "exit 0 #"}, + // While git-fetch(1) by default won't write commit graphs, both CNG and // Omnibus set this value to true. This has caused performance issues when // doing internal fetches, and furthermore it's not encouraged to run such diff --git a/internal/git/localrepo/remote.go b/internal/git/localrepo/remote.go index b9d4428aa..53b1ce67e 100644 --- a/internal/git/localrepo/remote.go +++ b/internal/git/localrepo/remote.go @@ -128,14 +128,6 @@ func (repo *Repo) FetchInternal( commandOptions := []git.CmdOpt{ git.WithEnv(opts.Env...), git.WithStderr(opts.Stderr), - // We've observed performance issues when fetching into big repositories part of an - // object pool. The root cause of this seems to be the connectivity check, which by - // default will also include references of any alternates. Given that object pools - // often have hundreds of thousands of references, this is quite expensive to - // compute. Below config entry will disable listing of alternate refs: they - // shouldn't even be included in the negotiation phase, so they aren't going to - // matter in the connectivity check either. - git.WithConfig(git.ConfigPair{Key: "core.alternateRefsCommand", Value: "exit 0 #"}), git.WithInternalFetchWithSidechannel( &gitalypb.SSHUploadPackWithSidechannelRequest{ Repository: remoteRepo, diff --git a/internal/git/localrepo/repo.go b/internal/git/localrepo/repo.go index 14bf8ad10..6d6ac6da3 100644 --- a/internal/git/localrepo/repo.go +++ b/internal/git/localrepo/repo.go @@ -3,7 +3,10 @@ package localrepo import ( "bytes" "context" + "errors" "fmt" + "io/fs" + "os" "strconv" "strings" "testing" @@ -129,7 +132,6 @@ func (repo *Repo) Size(ctx context.Context, opts ...RepoSizeOption) (int64, erro var stdout bytes.Buffer var cfg repoSizeConfig - for _, opt := range opts { opt(&cfg) } @@ -143,17 +145,46 @@ func (repo *Repo) Size(ctx context.Context, opts ...RepoSizeOption) (int64, erro } if cfg.ExcludeAlternates { - options = append(options, - git.Flag{Name: "--not"}, - git.Flag{Name: "--alternate-refs"}, - git.Flag{Name: "--not"}, - ) + alternatesPath, err := repo.InfoAlternatesPath() + if err != nil { + return 0, fmt.Errorf("getting alternates path: %w", err) + } + + // when excluding alternatives we need to be careful with using the bitmap index. If + // the repository is indeed linked to an alternative object directory, then we know + // that only the linked-to object directory will have bitmaps. Consequentially, this + // bitmap will only ever cover objects that are part of the alternate repository and + // can thus by definition not contain any objects that are only part of the repo + // that is linking to it. Unfortunately, this case causes us to run into an edge + // case in Git where the command takes significantly longer to compute the disk size + // when using bitmaps compared to when not using bitmaps. + // + // To work around this case we thus don't use a bitmap index in case we find that + // the repository has an alternates file. + if _, err := os.Stat(alternatesPath); err != nil && errors.Is(err, fs.ErrNotExist) { + // The alternates file does not exist. We can thus use the bitmap index and + // don't have to specify `--not --alternate-refs` given that there aren't + // any anyway. + options = append(options, git.Flag{Name: "--use-bitmap-index"}) + } else { + // We either have a bitmap index or we have run into any error that is not + // `fs.ErrNotExist`. In that case we don't use a bitmap index, but will + // exclude objects reachable from alternate refs. + options = append(options, + git.Flag{Name: "--not"}, + git.Flag{Name: "--alternate-refs"}, + git.Flag{Name: "--not"}, + ) + } + } else { + // If we don't exclude objects reachable from alternate refs we can always enable + // use of the bitmap index. + options = append(options, git.Flag{Name: "--use-bitmap-index"}) } options = append(options, git.Flag{Name: "--all"}, git.Flag{Name: "--objects"}, - git.Flag{Name: "--use-bitmap-index"}, git.Flag{Name: "--disk-usage"}, ) diff --git a/internal/git/localrepo/repo_test.go b/internal/git/localrepo/repo_test.go index 5b334bcfe..bc986eddf 100644 --- a/internal/git/localrepo/repo_test.go +++ b/internal/git/localrepo/repo_test.go @@ -1,18 +1,19 @@ package localrepo import ( - "bytes" "context" + "fmt" "os" "path/filepath" + "strings" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v15/internal/git" "gitlab.com/gitlab-org/gitaly/v15/internal/git/catfile" "gitlab.com/gitlab-org/gitaly/v15/internal/git/gittest" "gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/v15/internal/helper/text" "gitlab.com/gitlab-org/gitaly/v15/internal/testhelper" "gitlab.com/gitlab-org/gitaly/v15/internal/testhelper/testcfg" "gitlab.com/gitlab-org/gitaly/v15/proto/go/gitalypb" @@ -43,198 +44,301 @@ func TestRepo(t *testing.T) { } func TestSize(t *testing.T) { + t.Parallel() + cfg := testcfg.Build(t) - gitCmdFactory := gittest.NewCommandFactory(t, cfg) catfileCache := catfile.NewCache(cfg) t.Cleanup(catfileCache.Stop) + ctx := testhelper.Context(t) + + commandArgFile := filepath.Join(testhelper.TempDir(t), "args") + interceptingFactory := gittest.NewInterceptingCommandFactory(ctx, t, cfg, func(execEnv git.ExecutionEnvironment) string { + return fmt.Sprintf(`#!/bin/bash + echo "$@" >%q + exec %q "$@" + `, commandArgFile, execEnv.BinaryPath) + }) + testCases := []struct { - desc string - setup func(repoPath string, t *testing.T) - expectedSize int64 + desc string + setup func(t *testing.T) *gitalypb.Repository + opts []RepoSizeOption + expectedSize int64 + expectedUseBitmap bool }{ { desc: "empty repository", expectedSize: 0, + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, _ := gittest.InitRepo(t, cfg, cfg.Storages[0]) + return repoProto + }, + expectedUseBitmap: true, }, { - desc: "one committed file", - setup: func(repoPath string, t *testing.T) { - require.NoError(t, os.WriteFile( - filepath.Join(repoPath, "file"), - bytes.Repeat([]byte("a"), 1000), - 0o644, - )) - - cmd := gittest.NewCommand(t, cfg, "-C", repoPath, "add", "file") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-m", "initial") - require.NoError(t, cmd.Run()) + desc: "referenced commit", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + gittest.WithBranch("main"), + ) + + return repoProto }, - expectedSize: 202, + expectedSize: 203, + expectedUseBitmap: true, }, { - desc: "one large loose blob", - setup: func(repoPath string, t *testing.T) { + desc: "unreferenced commit", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + ) + + return repoProto + }, + expectedSize: 0, + expectedUseBitmap: true, + }, + { + desc: "modification to blob without repack", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + rootCommitID := gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + ) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(rootCommitID), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1001)}, + ), + gittest.WithMessage("modification"), + gittest.WithBranch("main"), + ) + + return repoProto + }, + expectedSize: 439, + expectedUseBitmap: true, + }, + { + desc: "modification to blob after repack", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + rootCommitID := gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + ) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(rootCommitID), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "file", Mode: "100644", Content: strings.Repeat("a", 1001)}, + ), + gittest.WithMessage("modification"), + gittest.WithBranch("main"), + ) + + gittest.Exec(t, cfg, "-C", repoPath, "repack", "-a", "-d") + + return repoProto + }, + expectedSize: 398, + expectedUseBitmap: true, + }, + { + desc: "excluded single ref", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + gittest.WithBranch("exclude-me"), + ) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("x", 2000)}, + ), + gittest.WithBranch("include-me"), + ) + + return repoProto + }, + opts: []RepoSizeOption{ + WithExcludeRefs("refs/heads/exclude-me"), + }, + expectedSize: 217, + expectedUseBitmap: true, + }, + { + desc: "excluded everything", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + gittest.WriteCommit(t, cfg, repoPath, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + gittest.WithBranch("exclude-me"), + ) + + return repoProto + }, + opts: []RepoSizeOption{ + WithExcludeRefs("refs/heads/*"), + }, + expectedSize: 0, + expectedUseBitmap: true, + }, + { + desc: "repo with alternate", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + _, poolPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + require.NoError(t, os.WriteFile( - filepath.Join(repoPath, "file"), - bytes.Repeat([]byte("a"), 1000), - 0o644, + filepath.Join(repoPath, "objects", "info", "alternates"), + []byte(filepath.Join(poolPath, "objects")), + os.ModePerm, )) - cmd := gittest.NewCommand(t, cfg, "-C", repoPath, "checkout", "-b", "branch-a") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "add", "file") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-m", "initial") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "update-ref", "-d", "refs/heads/branch-a") - require.NoError(t, cmd.Run()) + for _, path := range []string{repoPath, poolPath} { + gittest.WriteCommit(t, cfg, path, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + gittest.WithBranch("main"), + ) + } + + return repoProto }, - expectedSize: 0, + // While both repositories have the same contents, we should still return + // the actual repository's size because we don't exclude the alternate. + expectedSize: 207, + // Even though we have an alternate, we should still use bitmap indices + // given that we don't use `--not --alternate-refs`. + expectedUseBitmap: true, }, { - desc: "modification to blob without repack", - setup: func(repoPath string, t *testing.T) { + desc: "exclude alternate with identical contents", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + _, poolPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + require.NoError(t, os.WriteFile( - filepath.Join(repoPath, "file"), - bytes.Repeat([]byte("a"), 1000), - 0o644, + filepath.Join(repoPath, "objects", "info", "alternates"), + []byte(filepath.Join(poolPath, "objects")), + os.ModePerm, )) - cmd := gittest.NewCommand(t, cfg, "-C", repoPath, "add", "file") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-m", "initial") - require.NoError(t, cmd.Run()) - - f, err := os.OpenFile( - filepath.Join(repoPath, "file"), - os.O_APPEND|os.O_WRONLY, - 0o644) - require.NoError(t, err) - defer f.Close() - _, err = f.WriteString("a") - assert.NoError(t, err) - - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-am", "modification") - require.NoError(t, cmd.Run()) + // We write the same object into both repositories, so we should + // exclude it from our size calculations. + for _, path := range []string{repoPath, poolPath} { + gittest.WriteCommit(t, cfg, path, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + gittest.WithBranch("main"), + ) + } + + return repoProto + }, + opts: []RepoSizeOption{ + WithoutAlternates(), }, - expectedSize: 437, + expectedSize: 0, + expectedUseBitmap: false, }, { - desc: "modification to blob after repack", - setup: func(repoPath string, t *testing.T) { + desc: "exclude alternate with additional contents", + setup: func(t *testing.T) *gitalypb.Repository { + repoProto, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + _, poolPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + require.NoError(t, os.WriteFile( - filepath.Join(repoPath, "file"), - bytes.Repeat([]byte("a"), 1000), - 0o644, + filepath.Join(repoPath, "objects", "info", "alternates"), + []byte(filepath.Join(poolPath, "objects")), + os.ModePerm, )) - cmd := gittest.NewCommand(t, cfg, "-C", repoPath, "add", "file") - require.NoError(t, cmd.Run()) - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-m", "initial") - require.NoError(t, cmd.Run()) - - f, err := os.OpenFile( - filepath.Join(repoPath, "file"), - os.O_APPEND|os.O_WRONLY, - 0o644) - require.NoError(t, err) - defer f.Close() - _, err = f.WriteString("a") - assert.NoError(t, err) - - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "commit", "-am", "modification") - require.NoError(t, cmd.Run()) - - cmd = gittest.NewCommand(t, cfg, "-C", repoPath, "repack", "-a", "-d") - require.NoError(t, cmd.Run()) + for i, path := range []string{repoPath, poolPath} { + // We first write one blob into the repo that is the same + // across both repositories. + rootCommitID := gittest.WriteCommit(t, cfg, path, + gittest.WithParents(), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: strings.Repeat("a", 1000)}, + ), + ) + + // But this time we also write a second commit into each of + // the repositories that is not the same to simulate history + // that has diverged. + gittest.WriteCommit(t, cfg, path, + gittest.WithParents(rootCommitID), + gittest.WithTreeEntries( + gittest.TreeEntry{Path: "1kbblob", Mode: "100644", Content: fmt.Sprintf("%d", i)}, + ), + gittest.WithBranch("main"), + ) + } + + return repoProto }, - expectedSize: 391, + opts: []RepoSizeOption{ + WithoutAlternates(), + }, + expectedSize: 224, + expectedUseBitmap: false, }, } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - pbRepo, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0], gittest.InitRepoOpts{ - WithWorktree: true, - }) - repo := New(config.NewLocator(cfg), gitCmdFactory, catfileCache, pbRepo) - if tc.setup != nil { - tc.setup(repoPath, t) - } + require.NoError(t, os.RemoveAll(commandArgFile)) + + repoProto := tc.setup(t) + repo := New(config.NewLocator(cfg), interceptingFactory, catfileCache, repoProto) - ctx := testhelper.Context(t) - size, err := repo.Size(ctx) + size, err := repo.Size(ctx, tc.opts...) require.NoError(t, err) - assert.Equal(t, tc.expectedSize, size) + require.Equal(t, tc.expectedSize, size) + + commandArgs := text.ChompBytes(testhelper.MustReadFile(t, commandArgFile)) + if tc.expectedUseBitmap { + require.Contains(t, commandArgs, "--use-bitmap-index") + } else { + require.NotContains(t, commandArgs, "--use-bitmap-index") + } }) } } - -func TestSize_excludeRefs(t *testing.T) { - cfg := testcfg.Build(t) - gitCmdFactory := gittest.NewCommandFactory(t, cfg) - catfileCache := catfile.NewCache(cfg) - t.Cleanup(catfileCache.Stop) - - pbRepo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) - blob := bytes.Repeat([]byte("a"), 1000) - blobOID := gittest.WriteBlob(t, cfg, repoPath, blob) - treeOID := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ - { - OID: blobOID, - Mode: "100644", - Path: "1kbblob", - }, - }) - commitOID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(treeOID)) - - repo := New(config.NewLocator(cfg), gitCmdFactory, catfileCache, pbRepo) - - ctx := testhelper.Context(t) - sizeBeforeKeepAround, err := repo.Size(ctx) - require.NoError(t, err) - - gittest.WriteRef(t, cfg, repoPath, git.ReferenceName("refs/keep-around/keep1"), commitOID) - - sizeWithKeepAround, err := repo.Size(ctx) - require.NoError(t, err) - assert.Less(t, sizeBeforeKeepAround, sizeWithKeepAround) - - sizeWithoutKeepAround, err := repo.Size(ctx, WithExcludeRefs("refs/keep-around/*")) - require.NoError(t, err) - - assert.Equal(t, sizeBeforeKeepAround, sizeWithoutKeepAround) -} - -func TestSize_excludeAlternates(t *testing.T) { - cfg := testcfg.Build(t) - gitCmdFactory := gittest.NewCommandFactory(t, cfg) - catfileCache := catfile.NewCache(cfg) - t.Cleanup(catfileCache.Stop) - locator := config.NewLocator(cfg) - - pbRepo, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) - _, altRepoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) - - require.NoError(t, os.WriteFile( - filepath.Join(repoPath, "objects", "info", "alternates"), - []byte(filepath.Join(altRepoPath, "objects")), - os.ModePerm, - )) - - repo := New(locator, gitCmdFactory, catfileCache, pbRepo) - - ctx := testhelper.Context(t) - - gittest.Exec(t, cfg, "-C", repoPath, "gc") - - sizeIncludingAlternates, err := repo.Size(ctx) - require.NoError(t, err) - assert.Greater(t, sizeIncludingAlternates, int64(0)) - - sizeExcludingAlternates, err := repo.Size(ctx, WithoutAlternates()) - require.NoError(t, err) - assert.Equal(t, int64(0), sizeExcludingAlternates) -} |