diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-03 16:54:28 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-10 11:03:42 +0300 |
commit | 7b82783b7bd75040708f8a9c01c14f2ac9516698 (patch) | |
tree | 6069bd7a00cb792884431071305803cee3cdc1d6 | |
parent | ef53a8faf52c348009eb87728be2f6ff79e876a1 (diff) |
repository: Use heuristic to pack refs in OptimizeRepository
Except for the housekeeping tasks which prune corrupt references,
OptimizeRepository is currently completely ignoring references. As a
result, it could be that the repository is well-packed with regards to
its objects but is a mess with regards to its references.
Git has two ways to store references right now: loose references are
the "default" backend, where every changed reference is written into its
own separate file. Loose references are inefficient though: reading a single
reference requires us to traverse multiple directories, open the file,
scan its contents and so on. Git thus also knows a packed format of
references, where the git-pack-refs(1) command will collect all loose
refs and write them into a single packed-refs file. Reading this format
is a lot more efficient. We should thus make sure that we regularly
repack refs.
Introduce a new heuristic which packs loose references: whenever there
are too many of them we run git-pack-refs(1) to pack them. Packing refs
scales with the number of references which exist in the repository. We
thus use the size of the existing packed-refs file to inform ourselves
about how long it's likely to take to pack them. The bigger its size,
the more loose refs we accept in the repository. This is a tradeoff that
trades runtime of OptimizeRepository against git-pack-refs(1).
We use a similar heuristic here as for packing objects by using a
logarithmic function. This again has the effect that we're ramping up
the number of accepted loose references before a repack rather fast, but
then put an effective limit on how many refs are accepted in the
repository. The scaling factor of this may need to be adjusted, but
should be a good first approximation of what we need.
Note that we do not take into account how many loose refs there are to
estimate the number of total references: the more refs we have there, the
more we want to repack them, especially so if we have no or only a very
small packed-refs file. It thus shouldn't be required to also take those
into account.
Changelog: changed
-rw-r--r-- | internal/gitaly/service/repository/optimize.go | 83 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize_test.go | 89 |
2 files changed, 172 insertions, 0 deletions
diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go index 3d8f10891..e1ae73552 100644 --- a/internal/gitaly/service/repository/optimize.go +++ b/internal/gitaly/service/repository/optimize.go @@ -1,9 +1,11 @@ package repository import ( + "bytes" "context" "errors" "fmt" + "io/fs" "math" "os" "path/filepath" @@ -55,6 +57,7 @@ func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) e optimizations := struct { PackedObjects bool `json:"packed_objects"` PrunedObjects bool `json:"pruned_objects"` + PackedRefs bool `json:"packed_refs"` }{} defer func() { ctxlogrus.Extract(ctx).WithField("optimizations", optimizations).Info("optimized repository") @@ -76,6 +79,12 @@ func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) e } optimizations.PrunedObjects = didPrune + didPackRefs, err := packRefsIfNeeded(ctx, repo) + if err != nil { + return fmt.Errorf("could not pack refs: %w", err) + } + optimizations.PackedRefs = didPackRefs + return nil } @@ -334,3 +343,77 @@ func pruneIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { return true, nil } + +func packRefsIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { + repoPath, err := repo.Path() + if err != nil { + return false, fmt.Errorf("getting repository path: %w", err) + } + refsPath := filepath.Join(repoPath, "refs") + + looseRefs := int64(0) + if err := filepath.WalkDir(refsPath, func(path string, entry fs.DirEntry, err error) error { + if err != nil { + return err + } + + if !entry.IsDir() { + looseRefs++ + } + + return nil + }); err != nil { + return false, fmt.Errorf("counting loose refs: %w", err) + } + + // If there aren't any loose refs then there is nothing we need to do. 
+ if looseRefs == 0 { + return false, nil + } + + packedRefsSize := int64(0) + if stat, err := os.Stat(filepath.Join(repoPath, "packed-refs")); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return false, fmt.Errorf("getting packed-refs size: %w", err) + } + } else { + packedRefsSize = stat.Size() + } + + // Packing loose references into the packed-refs file scales with the number of references + // we're about to write. We thus decide whether we repack refs by weighing the current size + // of the packed-refs file against the number of loose references. This is done such that we + // do not repack too often on repositories with a huge number of references, where we can + // expect a lot of churn in the number of references. + // + // As a heuristic, we repack if the number of loose references in the repository exceeds + // `log(packed_refs_size_in_bytes/100)/log(1.15)`, which scales as following (number of refs + // is estimated with 100 bytes per reference): + // + // - 1kB ~ 10 packed refs: 16 refs + // - 10kB ~ 100 packed refs: 33 refs + // - 100kB ~ 1k packed refs: 49 refs + // - 1MB ~ 10k packed refs: 66 refs + // - 10MB ~ 100k packed refs: 82 refs + // - 100MB ~ 1m packed refs: 99 refs + // + // We thus allow roughly 16 additional loose refs per factor of ten of packed refs. + // + // This heuristic may likely need tweaking in the future, but should serve as a good first + // iteration. 
+ if int64(math.Max(16, math.Log(float64(packedRefsSize)/100)/math.Log(1.15))) > looseRefs { + return false, nil + } + + var stderr bytes.Buffer + if err := repo.ExecAndWait(ctx, git.SubCmd{ + Name: "pack-refs", + Flags: []git.Option{ + git.Flag{Name: "--all"}, + }, + }, git.WithStderr(&stderr)); err != nil { + return false, fmt.Errorf("packing refs: %w, stderr: %q", err, stderr.String()) + } + + return true, nil +} diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go index e09cf2a18..f7e4152e0 100644 --- a/internal/gitaly/service/repository/optimize_test.go +++ b/internal/gitaly/service/repository/optimize_test.go @@ -605,6 +605,95 @@ func TestPruneIfNeeded(t *testing.T) { } } +func TestPackRefsIfNeeded(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg, _ := setupRepositoryServiceWithoutRepo(t) + + const kiloByte = 1024 + + for _, tc := range []struct { + packedRefsSize int64 + requiredRefs int + }{ + { + packedRefsSize: 1, + requiredRefs: 16, + }, + { + packedRefsSize: 1 * kiloByte, + requiredRefs: 16, + }, + { + packedRefsSize: 10 * kiloByte, + requiredRefs: 33, + }, + { + packedRefsSize: 100 * kiloByte, + requiredRefs: 49, + }, + { + packedRefsSize: 1000 * kiloByte, + requiredRefs: 66, + }, + { + packedRefsSize: 10000 * kiloByte, + requiredRefs: 82, + }, + { + packedRefsSize: 100000 * kiloByte, + requiredRefs: 99, + }, + } { + t.Run(fmt.Sprintf("packed-refs with %d bytes", tc.packedRefsSize), func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + // Write an empty commit such that we can create valid refs. + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents()) + looseRefContent := []byte(commitID.String() + "\n") + + // We first create a single big packfile which is used to determine the + // boundary of when we repack. 
We need to write a valid packed-refs file or + // otherwise git-pack-refs(1) would choke later on, so we just write the + // file such that every line is a separate ref of exactly 128 bytes in + // length (a divisor of 1024), referring to the commit we created above. + packedRefs, err := os.OpenFile(filepath.Join(repoPath, "packed-refs"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644) + require.NoError(t, err) + defer testhelper.MustClose(t, packedRefs) + for i := int64(0); i < tc.packedRefsSize/128; i++ { + packedRefLine := fmt.Sprintf("%s refs/something/this-line-is-padded-to-exactly-128-bytes-%030d\n", commitID.String(), i) + require.Len(t, packedRefLine, 128) + _, err := packedRefs.WriteString(packedRefLine) + require.NoError(t, err) + } + require.NoError(t, packedRefs.Sync()) + + // And then we create one less loose ref than we need to hit the boundary. + // This is done to assert that we indeed don't repack before hitting the + // boundary. + for i := 0; i < tc.requiredRefs-1; i++ { + looseRefPath := filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("branch-%d", i)) + require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644)) + } + + didRepack, err := packRefsIfNeeded(ctx, repo) + require.NoError(t, err) + require.False(t, didRepack) + + // Now we create the additional loose ref that causes us to hit the + // boundary. We should thus see that we want to repack now. + looseRefPath := filepath.Join(repoPath, "refs", "heads", "last-branch") + require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644)) + + didRepack, err = packRefsIfNeeded(ctx, repo) + require.NoError(t, err) + require.True(t, didRepack) + }) + } +} + func TestEstimateLooseObjectCount(t *testing.T) { t.Parallel() |