diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-03 16:54:28 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-10 11:03:42 +0300 |
commit | 7b82783b7bd75040708f8a9c01c14f2ac9516698 (patch) | |
tree | 6069bd7a00cb792884431071305803cee3cdc1d6 | |
parent | ef53a8faf52c348009eb87728be2f6ff79e876a1 (diff) |
repository: Use heuristic to pack refs in OptimizeRepository
Except for the housekeeping tasks which prune corrupt references,
OptimizeRepository is currently completely ignoring references. As a
result, it could be that the repository is well-packed with regards to
its objects but is a mess with regards to its references.
Git has two ways to store references right now: loose references are
the "default" backend, where every changed reference is written into its
own separate file. Loose references are inefficient though: reading a single
reference requires us to traverse multiple directories, open the file,
scan its contents and so on. Git thus also knows a packed format of
references, where the git-pack-refs(1) command will collect all loose
refs and write them into a single packed-refs file. Reading this format
is a lot more efficient. We should thus make sure that we regularly
repack refs.
Introduce a new heuristic which packs loose references: whenever there
are too many of them we run git-pack-refs(1) to pack them. Packing refs
scales with the number of references which exist in the repository. We
thus use the size of the existing packed-refs file to inform ourselves
about how long it's likely to take to pack them. The bigger its size,
the more loose refs we accept in the repository. This is a tradeoff that
trades runtime of OptimizeRepository against git-pack-refs(1).
We use a similar heuristic here as for packing objects by using a
logarithmic function. This again has the effect that we're ramping up
the number of accepted loose references before a repack rather fast, but
then put an effective limit on how many refs are accepted in the
repository. The scaling factor of this may need to be adjusted, but
should be a good first approximation of what we need.
Note that we do not take into account how many loose refs there are to
estimate the number of total references: the more refs we have there, the
more we want to repack them, especially so if we have no or only a very
small packed-refs file. It thus shouldn't be required to also take those
into account.
Changelog: changed
-rw-r--r-- | internal/gitaly/service/repository/optimize.go | 83 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize_test.go | 89 |
2 files changed, 172 insertions, 0 deletions
diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go index 3d8f10891..e1ae73552 100644 --- a/internal/gitaly/service/repository/optimize.go +++ b/internal/gitaly/service/repository/optimize.go @@ -1,9 +1,11 @@ package repository import ( + "bytes" "context" "errors" "fmt" + "io/fs" "math" "os" "path/filepath" @@ -55,6 +57,7 @@ func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) e optimizations := struct { PackedObjects bool `json:"packed_objects"` PrunedObjects bool `json:"pruned_objects"` + PackedRefs bool `json:"packed_refs"` }{} defer func() { ctxlogrus.Extract(ctx).WithField("optimizations", optimizations).Info("optimized repository") @@ -76,6 +79,12 @@ func (s *server) optimizeRepository(ctx context.Context, repo *localrepo.Repo) e } optimizations.PrunedObjects = didPrune + didPackRefs, err := packRefsIfNeeded(ctx, repo) + if err != nil { + return fmt.Errorf("could not pack refs: %w", err) + } + optimizations.PackedRefs = didPackRefs + return nil } @@ -334,3 +343,77 @@ func pruneIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { return true, nil } + +func packRefsIfNeeded(ctx context.Context, repo *localrepo.Repo) (bool, error) { + repoPath, err := repo.Path() + if err != nil { + return false, fmt.Errorf("getting repository path: %w", err) + } + refsPath := filepath.Join(repoPath, "refs") + + looseRefs := int64(0) + if err := filepath.WalkDir(refsPath, func(path string, entry fs.DirEntry, err error) error { + if err != nil { + return err + } + + if !entry.IsDir() { + looseRefs++ + } + + return nil + }); err != nil { + return false, fmt.Errorf("counting loose refs: %w", err) + } + + // If there aren't any loose refs then there is nothing we need to do. 
+ if looseRefs == 0 { + return false, nil + } + + packedRefsSize := int64(0) + if stat, err := os.Stat(filepath.Join(repoPath, "packed-refs")); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return false, fmt.Errorf("getting packed-refs size: %w", err) + } + } else { + packedRefsSize = stat.Size() + } + + // Packing loose references into the packed-refs file scales with the number of references + // we're about to write. We thus decide whether we repack refs by weighing the current size + // of the packed-refs file against the number of loose references. This is done such that we + // do not repack too often on repositories with a huge number of references, where we can + // expect a lot of churn in the number of references. + // + // As a heuristic, we repack if the number of loose references in the repository exceeds + // `log(packed_refs_size_in_bytes/100)/log(1.15)`, which scales as following (number of refs + // is estimated with 100 bytes per reference): + // + // - 1kB ~ 10 packed refs: 16 refs + // - 10kB ~ 100 packed refs: 33 refs + // - 100kB ~ 1k packed refs: 49 refs + // - 1MB ~ 10k packed refs: 66 refs + // - 10MB ~ 100k packed refs: 82 refs + // - 100MB ~ 1m packed refs: 99 refs + // + // We thus allow roughly 16 additional loose refs per factor of ten of packed refs. + // + // This heuristic may likely need tweaking in the future, but should serve as a good first + // iteration. 
+ if int64(math.Max(16, math.Log(float64(packedRefsSize)/100)/math.Log(1.15))) > looseRefs { + return false, nil + } + + var stderr bytes.Buffer + if err := repo.ExecAndWait(ctx, git.SubCmd{ + Name: "pack-refs", + Flags: []git.Option{ + git.Flag{Name: "--all"}, + }, + }, git.WithStderr(&stderr)); err != nil { + return false, fmt.Errorf("packing refs: %w, stderr: %q", err, stderr.String()) + } + + return true, nil +} diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go index e09cf2a18..f7e4152e0 100644 --- a/internal/gitaly/service/repository/optimize_test.go +++ b/internal/gitaly/service/repository/optimize_test.go @@ -605,6 +605,95 @@ func TestPruneIfNeeded(t *testing.T) { } } +func TestPackRefsIfNeeded(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg, _ := setupRepositoryServiceWithoutRepo(t) + + const kiloByte = 1024 + + for _, tc := range []struct { + packedRefsSize int64 + requiredRefs int + }{ + { + packedRefsSize: 1, + requiredRefs: 16, + }, + { + packedRefsSize: 1 * kiloByte, + requiredRefs: 16, + }, + { + packedRefsSize: 10 * kiloByte, + requiredRefs: 33, + }, + { + packedRefsSize: 100 * kiloByte, + requiredRefs: 49, + }, + { + packedRefsSize: 1000 * kiloByte, + requiredRefs: 66, + }, + { + packedRefsSize: 10000 * kiloByte, + requiredRefs: 82, + }, + { + packedRefsSize: 100000 * kiloByte, + requiredRefs: 99, + }, + } { + t.Run(fmt.Sprintf("packed-refs with %d bytes", tc.packedRefsSize), func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + + // Write an empty commit such that we can create valid refs. + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents()) + looseRefContent := []byte(commitID.String() + "\n") + + // We first create a single big packfile which is used to determine the + // boundary of when we repack. 
We need to write a valid packed-refs file or + // otherwise git-pack-refs(1) would choke later on, so we just write the + // file such that every line is a separate ref of exactly 128 bytes in + // length (a divisor of 1024), referring to the commit we created above. + packedRefs, err := os.OpenFile(filepath.Join(repoPath, "packed-refs"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644) + require.NoError(t, err) + defer testhelper.MustClose(t, packedRefs) + for i := int64(0); i < tc.packedRefsSize/128; i++ { + packedRefLine := fmt.Sprintf("%s refs/something/this-line-is-padded-to-exactly-128-bytes-%030d\n", commitID.String(), i) + require.Len(t, packedRefLine, 128) + _, err := packedRefs.WriteString(packedRefLine) + require.NoError(t, err) + } + require.NoError(t, packedRefs.Sync()) + + // And then we create one less loose ref than we need to hit the boundary. + // This is done to assert that we indeed don't repack before hitting the + // boundary. + for i := 0; i < tc.requiredRefs-1; i++ { + looseRefPath := filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("branch-%d", i)) + require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644)) + } + + didRepack, err := packRefsIfNeeded(ctx, repo) + require.NoError(t, err) + require.False(t, didRepack) + + // Now we create the additional loose ref that causes us to hit the + // boundary. We should thus see that we want to repack now. + looseRefPath := filepath.Join(repoPath, "refs", "heads", "last-branch") + require.NoError(t, os.WriteFile(looseRefPath, looseRefContent, 0o644)) + + didRepack, err = packRefsIfNeeded(ctx, repo) + require.NoError(t, err) + require.True(t, didRepack) + }) + } +} + func TestEstimateLooseObjectCount(t *testing.T) { t.Parallel() |