diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-07 12:05:17 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-02-10 10:44:03 +0300 |
commit | fb336f03024d99927804cc3781a55996ccd817ad (patch) | |
tree | 27990302272591a1b177fe84fa4d3df5a2509454 | |
parent | bdddc1e4f9011068d14883a3377525fef0e9141b (diff) |
repository: Use heuristic for full repacks in OptimizeRepository
Right now we're only repacking repositories in OptimizeRepository when
they're missing auxiliary caches like bitmaps or commit-graphs with
bloom filters. While this has been helpful in rolling out the generation
of these caches, ultimately we're advancing to a state where every repo
should have both of them. As a result, calling OptimizeRepository is
becoming more and more of a no-op over time. What we want is for this
RPC to not only repack repositories if they're lacking crucial caches,
but also when we can determine that they're in an inefficient state.
Introduce a new heuristic which determines whether a repository needs a
full repack. A full repack will collapse all the packfiles we have into
a single packfile, which allows Git to look up objects without having to
search through all packfiles. Furthermore, it allows us to deltify more
objects because deltas can only be created for objects which reside in
the same packfile.
Doing a full repack of the repository scales with the number of objects
which exist in the repository and with their total size: the bigger the
repository is, the longer it takes. As a consequence, we need to become
more careful with repacking the repository as it grows given that we'd
otherwise be repacking it all the time.
To address this we use a heuristic which estimates the repository's size
based on the biggest packfile that exists in the repository. While this
is not an accurate reflection of repository size, it is fast to compute.
From here on we use a logarithmic function to grow the boundary of how
many packfiles need to exist before we do a full repack. This has the
effect that we're initially scaling up the number of allowed packfiles
quite fast, but decelerate when we approach repositories where the
packfiles range in the gigabytes.
This heuristic is imperfect by necessity, but it should give us a first
good iteration of this. Note that this is only a stop-gap solution
anyway: eventually, we want to migrate away from the full/incremental
repack split and always use multi-pack indices and geometric repacking,
which solves this problem of huge repositories in a much saner way.
Changelog: changed
-rw-r--r-- | internal/gitaly/service/repository/optimize.go | 85 | ||||
-rw-r--r-- | internal/gitaly/service/repository/optimize_test.go | 98 |
2 files changed, 183 insertions, 0 deletions
diff --git a/internal/gitaly/service/repository/optimize.go b/internal/gitaly/service/repository/optimize.go index 8a4829da1..1d5a84202 100644 --- a/internal/gitaly/service/repository/optimize.go +++ b/internal/gitaly/service/repository/optimize.go @@ -2,8 +2,12 @@ package repository import ( "context" + "errors" "fmt" + "math" "os" + "path/filepath" + "strings" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "gitlab.com/gitlab-org/gitaly/v14/internal/git/housekeeping" @@ -138,5 +142,86 @@ func needsRepacking(repo *localrepo.Repo) (bool, repackCommandConfig, error) { }, nil } + largestPackfileSize, packfileCount, err := packfileSizeAndCount(repo) + if err != nil { + return false, repackCommandConfig{}, fmt.Errorf("checking largest packfile size: %w", err) + } + + // Whenever we do an incremental repack we create a new packfile, and as a result Git may + // have to look into every one of the packfiles to find objects. This is less efficient the + // more packfiles we have, but we cannot repack the whole repository every time either given + // that this may take a lot of time. + // + // Instead, we determine whether the repository has "too many" packfiles. "Too many" is + // relative though: for small repositories it's fine to do full repacks regularly, but for + // large repositories we need to be more careful. We thus use a heuristic of "repository + // largeness": we take the biggest packfile that exists, and then the maximum allowed number + // of packfiles is `log(largestpackfile_size_in_mb) / log(1.3)`. This gives the following + // allowed number of packfiles: + // + // - No packfile: 5 packfile. This is a special case. + // - 10MB packfile: 8 packfiles. + // - 100MB packfile: 17 packfiles. + // - 500MB packfile: 23 packfiles. + // - 1GB packfile: 26 packfiles. + // - 5GB packfile: 32 packfiles. + // - 10GB packfile: 35 packfiles. + // - 100GB packfile: 43 packfiles. 
+ // + // The goal is to have a comparatively quick ramp-up of allowed packfiles as the repository + // size grows, but then slow down such that we're effectively capped and don't end up with + // an excessive amount of packfiles. + // + // This is a heuristic and thus imperfect by necessity. We may tune it as we gain experience + // with the way it behaves. + if int64(math.Max(5, math.Log(float64(largestPackfileSize))/math.Log(1.3))) < packfileCount { + return true, repackCommandConfig{ + fullRepack: true, + writeBitmap: !hasAlternate, + }, nil + } + return false, repackCommandConfig{}, nil } + +func packfileSizeAndCount(repo *localrepo.Repo) (int64, int64, error) { + repoPath, err := repo.Path() + if err != nil { + return 0, 0, fmt.Errorf("getting repository path: %w", err) + } + + entries, err := os.ReadDir(filepath.Join(repoPath, "objects/pack")) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return 0, 0, nil + } + + return 0, 0, err + } + + largestSize := int64(0) + count := int64(0) + + for _, entry := range entries { + if !strings.HasSuffix(entry.Name(), ".pack") { + continue + } + + entryInfo, err := entry.Info() + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + + return 0, 0, fmt.Errorf("getting packfile info: %w", err) + } + + if entryInfo.Size() > largestSize { + largestSize = entryInfo.Size() + } + + count++ + } + + return largestSize / 1024 / 1024, count, nil +} diff --git a/internal/gitaly/service/repository/optimize_test.go b/internal/gitaly/service/repository/optimize_test.go index 59d1f4e35..200f67eae 100644 --- a/internal/gitaly/service/repository/optimize_test.go +++ b/internal/gitaly/service/repository/optimize_test.go @@ -2,6 +2,8 @@ package repository import ( "bytes" + "fmt" + "io" "os" "path/filepath" "testing" @@ -188,6 +190,15 @@ func TestOptimizeRepositoryValidation(t *testing.T) { require.NoError(t, err) } +type infiniteReader struct{} + +func (r infiniteReader) Read(b []byte) (int, error) { + for i := 
range b { + b[i] = '\000' + } + return len(b), nil +} + func TestNeedsRepacking(t *testing.T) { t.Parallel() @@ -313,4 +324,91 @@ func TestNeedsRepacking(t *testing.T) { require.Equal(t, tc.expectedConfig, repackCfg) }) } + + const megaByte = 1024 * 1024 + + for _, tc := range []struct { + packfileSize int64 + requiredPackfiles int + }{ + { + packfileSize: 1, + requiredPackfiles: 5, + }, + { + packfileSize: 5 * megaByte, + requiredPackfiles: 6, + }, + { + packfileSize: 10 * megaByte, + requiredPackfiles: 8, + }, + { + packfileSize: 50 * megaByte, + requiredPackfiles: 14, + }, + { + packfileSize: 100 * megaByte, + requiredPackfiles: 17, + }, + { + packfileSize: 500 * megaByte, + requiredPackfiles: 23, + }, + { + packfileSize: 1000 * megaByte, + requiredPackfiles: 26, + }, + // Let's not go any further than this, we're thrashing the temporary directory. + } { + t.Run(fmt.Sprintf("packfile with %d bytes", tc.packfileSize), func(t *testing.T) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg) + repo := localrepo.NewTestRepo(t, cfg, repoProto) + packDir := filepath.Join(repoPath, "objects", "pack") + + // Emulate the existence of a bitmap and a commit-graph with bloom filters. + // We explicitly don't want to generate them via Git commands as they would + // require us to already have objects in the repository, and we want to be + // in full control over all objects and packfiles in the repo. + require.NoError(t, os.WriteFile(filepath.Join(packDir, "something.bitmap"), nil, 0o644)) + commitGraphChainPath := filepath.Join(repoPath, stats.CommitGraphChainRelPath) + require.NoError(t, os.MkdirAll(filepath.Dir(commitGraphChainPath), 0o755)) + require.NoError(t, os.WriteFile(commitGraphChainPath, nil, 0o644)) + + // We first create a single big packfile which is used to determine the + // boundary of when we repack. 
+ bigPackfile, err := os.OpenFile(filepath.Join(packDir, "big.pack"), os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o644) + require.NoError(t, err) + defer testhelper.MustClose(t, bigPackfile) + _, err = io.Copy(bigPackfile, io.LimitReader(infiniteReader{}, tc.packfileSize)) + require.NoError(t, err) + + // And then we create one less packfile than we need to hit the boundary. + // This is done to assert that we indeed don't repack before hitting the + // boundary. + for i := 0; i < tc.requiredPackfiles-1; i++ { + additionalPackfile, err := os.Create(filepath.Join(packDir, fmt.Sprintf("%d.pack", i))) + require.NoError(t, err) + testhelper.MustClose(t, additionalPackfile) + } + + repackNeeded, _, err := needsRepacking(repo) + require.NoError(t, err) + require.False(t, repackNeeded) + + // Now we create the additional packfile that causes us to hit the boundary. + // We should thus see that we want to repack now. + lastPackfile, err := os.Create(filepath.Join(packDir, "last.pack")) + require.NoError(t, err) + testhelper.MustClose(t, lastPackfile) + + repackNeeded, repackCfg, err := needsRepacking(repo) + require.NoError(t, err) + require.True(t, repackNeeded) + require.Equal(t, repackCommandConfig{ + fullRepack: true, + writeBitmap: true, + }, repackCfg) + }) + } } |