diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-11-25 14:03:48 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-11-30 17:46:13 +0300 |
commit | 5fbec7e59bf37c6696ffb19819aa15f4e4087534 (patch) | |
tree | a62a0b8c7e1f693cddab68554e82236d05efe49b | |
parent | feee635a4481c884f1aae8093487360772d7c61c (diff) |
housekeeping: Move `CountLooseObjects` into `git/stats`
Move `CountLooseObjects` into the `git/stats` package. Once it lives
there, the function can be used to derive information about loose objects
without having to spawn git-count-objects(1).
-rw-r--r-- | internal/git/housekeeping/optimization_strategy.go | 59 | ||||
-rw-r--r-- | internal/git/housekeeping/optimization_strategy_test.go | 203 | ||||
-rw-r--r-- | internal/git/stats/objects_info.go | 55 | ||||
-rw-r--r-- | internal/git/stats/objects_info_test.go | 207 |
4 files changed, 264 insertions, 260 deletions
diff --git a/internal/git/housekeeping/optimization_strategy.go b/internal/git/housekeeping/optimization_strategy.go index 8d26aff13..8b331b0f3 100644 --- a/internal/git/housekeeping/optimization_strategy.go +++ b/internal/git/housekeeping/optimization_strategy.go @@ -8,7 +8,6 @@ import ( "math" "os" "path/filepath" - "strings" "time" "gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo" @@ -98,12 +97,12 @@ func NewHeuristicalOptimizationStrategy(ctx context.Context, repo *localrepo.Rep strategy.packfileCount = packfilesInfo.Count strategy.packfileSize = packfilesInfo.Size - strategy.looseObjectCount, err = countLooseObjects(repo, time.Now()) + strategy.looseObjectCount, err = stats.CountLooseObjects(repo, time.Now()) if err != nil { return strategy, fmt.Errorf("estimating loose object count: %w", err) } - strategy.oldLooseObjectCount, err = countLooseObjects(repo, time.Now().Add(CutOffTime)) + strategy.oldLooseObjectCount, err = stats.CountLooseObjects(repo, time.Now().Add(CutOffTime)) if err != nil { return strategy, fmt.Errorf("estimating old loose object count: %w", err) } @@ -211,60 +210,6 @@ func (s HeuristicalOptimizationStrategy) ShouldRepackObjects() (bool, RepackObje return false, RepackObjectsConfig{} } -// countLooseObjects counts the number of loose objects in the repository. If a cutoff date is -// given, then this function will only take into account objects which are older than the given -// point in time. 
-func countLooseObjects(repo *localrepo.Repo, cutoffDate time.Time) (uint64, error) { - repoPath, err := repo.Path() - if err != nil { - return 0, fmt.Errorf("getting repository path: %w", err) - } - - var looseObjects uint64 - for i := 0; i <= 0xFF; i++ { - entries, err := os.ReadDir(filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i))) - if err != nil { - if errors.Is(err, os.ErrNotExist) { - continue - } - - return 0, fmt.Errorf("reading loose object shard: %w", err) - } - - for _, entry := range entries { - if !isValidLooseObjectName(entry.Name()) { - continue - } - - entryInfo, err := entry.Info() - if err != nil { - if errors.Is(err, fs.ErrNotExist) { - continue - } - - return 0, fmt.Errorf("reading object info: %w", err) - } - - if entryInfo.ModTime().After(cutoffDate) { - continue - } - - looseObjects++ - } - } - - return looseObjects, nil -} - -func isValidLooseObjectName(s string) bool { - for _, c := range []byte(s) { - if strings.IndexByte("0123456789abcdef", c) < 0 { - return false - } - } - return true -} - // ShouldWriteCommitGraph determines whether we need to write the commit-graph and how it should be // written. 
func (s HeuristicalOptimizationStrategy) ShouldWriteCommitGraph() (bool, WriteCommitGraphConfig) { diff --git a/internal/git/housekeeping/optimization_strategy_test.go b/internal/git/housekeeping/optimization_strategy_test.go index e03aa65ce..9db8a7439 100644 --- a/internal/git/housekeeping/optimization_strategy_test.go +++ b/internal/git/housekeeping/optimization_strategy_test.go @@ -4,7 +4,6 @@ package housekeeping import ( "fmt" - "math" "os" "path/filepath" "testing" @@ -679,208 +678,6 @@ func TestHeuristicalOptimizationStrategy_NeedsWriteCommitGraph(t *testing.T) { } } -func TestCountLooseObjects(t *testing.T) { - t.Parallel() - - ctx := testhelper.Context(t) - cfg := testcfg.Build(t) - - createRepo := func(t *testing.T) (*localrepo.Repo, string) { - repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{ - SkipCreationViaService: true, - }) - return localrepo.NewTestRepo(t, cfg, repoProto), repoPath - } - - t.Run("empty repository", func(t *testing.T) { - repo, _ := createRepo(t) - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.Zero(t, looseObjects) - }) - - t.Run("object in random shard", func(t *testing.T) { - repo, repoPath := createRepo(t) - - differentShard := filepath.Join(repoPath, "objects", "a0") - require.NoError(t, os.MkdirAll(differentShard, 0o755)) - - object, err := os.Create(filepath.Join(differentShard, "123456")) - require.NoError(t, err) - testhelper.MustClose(t, object) - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 1, looseObjects) - }) - - t.Run("objects in multiple shards", func(t *testing.T) { - repo, repoPath := createRepo(t) - - for _, shard := range []string{"00", "17", "32", "ff"} { - shardPath := filepath.Join(repoPath, "objects", shard) - require.NoError(t, os.MkdirAll(shardPath, 0o755)) - - object, err := os.Create(filepath.Join(shardPath, "123456")) - require.NoError(t, err) - 
testhelper.MustClose(t, object) - } - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 4, looseObjects) - }) - - t.Run("object in shard with grace period", func(t *testing.T) { - repo, repoPath := createRepo(t) - - shard := filepath.Join(repoPath, "objects", "17") - require.NoError(t, os.MkdirAll(shard, 0o755)) - - objectPaths := []string{ - filepath.Join(shard, "123456"), - filepath.Join(shard, "654321"), - } - - cutoffDate := time.Now() - afterCutoffDate := cutoffDate.Add(1 * time.Minute) - beforeCutoffDate := cutoffDate.Add(-1 * time.Minute) - - for _, objectPath := range objectPaths { - require.NoError(t, os.WriteFile(objectPath, nil, 0o644)) - require.NoError(t, os.Chtimes(objectPath, afterCutoffDate, afterCutoffDate)) - } - - // Objects are recent, so with the cutoff-date they shouldn't be counted. - looseObjects, err := countLooseObjects(repo, cutoffDate) - require.NoError(t, err) - require.EqualValues(t, 0, looseObjects) - - for i, objectPath := range objectPaths { - // Modify the object's mtime should cause it to be counted. 
- require.NoError(t, os.Chtimes(objectPath, beforeCutoffDate, beforeCutoffDate)) - - looseObjects, err = countLooseObjects(repo, cutoffDate) - require.NoError(t, err) - require.EqualValues(t, i+1, looseObjects) - } - }) - - t.Run("shard with garbage", func(t *testing.T) { - repo, repoPath := createRepo(t) - - shard := filepath.Join(repoPath, "objects", "17") - require.NoError(t, os.MkdirAll(shard, 0o755)) - - for _, objectName := range []string{"garbage", "012345"} { - require.NoError(t, os.WriteFile(filepath.Join(shard, objectName), nil, 0o644)) - } - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 1, looseObjects) - }) -} - -func BenchmarkCountLooseObjects(b *testing.B) { - ctx := testhelper.Context(b) - cfg := testcfg.Build(b) - - createRepo := func(b *testing.B) (*localrepo.Repo, string) { - repoProto, repoPath := gittest.CreateRepository(b, ctx, cfg, gittest.CreateRepositoryConfig{ - SkipCreationViaService: true, - }) - return localrepo.NewTestRepo(b, cfg, repoProto), repoPath - } - - b.Run("empty repository", func(b *testing.B) { - repo, _ := createRepo(b) - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with single object", func(b *testing.B) { - repo, repoPath := createRepo(b) - - objectPath := filepath.Join(repoPath, "objects", "17", "12345") - require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with single object in each shard", func(b *testing.B) { - repo, repoPath := createRepo(b) - - for i := 0; i < 256; i++ { - objectPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i), "12345") - require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) - 
require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository hitting loose object limit", func(b *testing.B) { - repo, repoPath := createRepo(b) - - // Usually we shouldn't have a lot more than `looseObjectCount` objects in the - // repository because we'd repack as soon as we hit that limit. So this benchmark - // case tries to estimate the usual upper limit for loose objects we'd typically - // have. - looseObjectCount := int(math.Ceil(looseObjectLimit / 256)) - - for i := 0; i < 256; i++ { - shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) - require.NoError(b, os.Mkdir(shardPath, 0o755)) - - for j := 0; j < looseObjectCount; j++ { - objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with lots of objects", func(b *testing.B) { - repo, repoPath := createRepo(b) - - for i := 0; i < 256; i++ { - shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) - require.NoError(b, os.Mkdir(shardPath, 0o755)) - - for j := 0; j < 1000; j++ { - objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) -} - func TestNewEagerOptimizationStrategy(t *testing.T) { t.Parallel() diff --git a/internal/git/stats/objects_info.go b/internal/git/stats/objects_info.go index eedd8316d..db9a54d7f 100644 --- a/internal/git/stats/objects_info.go +++ b/internal/git/stats/objects_info.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strconv" "strings" + "time" 
"github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "gitlab.com/gitlab-org/gitaly/v15/internal/git" @@ -189,6 +190,60 @@ func ObjectsInfoForRepository(ctx context.Context, repo *localrepo.Repo) (Object return info, nil } +// CountLooseObjects counts the number of loose objects in the repository. If a cutoff date is +// given, then this function will only take into account objects which are older than the given +// point in time. +func CountLooseObjects(repo *localrepo.Repo, cutoffDate time.Time) (uint64, error) { + repoPath, err := repo.Path() + if err != nil { + return 0, fmt.Errorf("getting repository path: %w", err) + } + + var looseObjects uint64 + for i := 0; i <= 0xFF; i++ { + entries, err := os.ReadDir(filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i))) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + + return 0, fmt.Errorf("reading loose object shard: %w", err) + } + + for _, entry := range entries { + if !isValidLooseObjectName(entry.Name()) { + continue + } + + entryInfo, err := entry.Info() + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + continue + } + + return 0, fmt.Errorf("reading object info: %w", err) + } + + if entryInfo.ModTime().After(cutoffDate) { + continue + } + + looseObjects++ + } + } + + return looseObjects, nil +} + +func isValidLooseObjectName(s string) bool { + for _, c := range []byte(s) { + if strings.IndexByte("0123456789abcdef", c) < 0 { + return false + } + } + return true +} + // PackfilesInfo contains information about packfiles. type PackfilesInfo struct { // Count is the number of loose objects, including stale ones. 
diff --git a/internal/git/stats/objects_info_test.go b/internal/git/stats/objects_info_test.go index 85404477b..1d8873fbe 100644 --- a/internal/git/stats/objects_info_test.go +++ b/internal/git/stats/objects_info_test.go @@ -1,6 +1,7 @@ package stats import ( + "fmt" "os" "path/filepath" "testing" @@ -326,6 +327,212 @@ func TestObjectsInfoForRepository(t *testing.T) { } } +func TestCountLooseObjects(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg := testcfg.Build(t) + + createRepo := func(t *testing.T) (*localrepo.Repo, string) { + repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + return localrepo.NewTestRepo(t, cfg, repoProto), repoPath + } + + t.Run("empty repository", func(t *testing.T) { + repo, _ := createRepo(t) + + looseObjects, err := CountLooseObjects(repo, time.Now()) + require.NoError(t, err) + require.Zero(t, looseObjects) + }) + + t.Run("object in random shard", func(t *testing.T) { + repo, repoPath := createRepo(t) + + differentShard := filepath.Join(repoPath, "objects", "a0") + require.NoError(t, os.MkdirAll(differentShard, 0o755)) + + object, err := os.Create(filepath.Join(differentShard, "123456")) + require.NoError(t, err) + testhelper.MustClose(t, object) + + looseObjects, err := CountLooseObjects(repo, time.Now()) + require.NoError(t, err) + require.EqualValues(t, 1, looseObjects) + }) + + t.Run("objects in multiple shards", func(t *testing.T) { + repo, repoPath := createRepo(t) + + for _, shard := range []string{"00", "17", "32", "ff"} { + shardPath := filepath.Join(repoPath, "objects", shard) + require.NoError(t, os.MkdirAll(shardPath, 0o755)) + + object, err := os.Create(filepath.Join(shardPath, "123456")) + require.NoError(t, err) + testhelper.MustClose(t, object) + } + + looseObjects, err := CountLooseObjects(repo, time.Now()) + require.NoError(t, err) + require.EqualValues(t, 4, looseObjects) + }) + + t.Run("object in shard with grace 
period", func(t *testing.T) { + repo, repoPath := createRepo(t) + + shard := filepath.Join(repoPath, "objects", "17") + require.NoError(t, os.MkdirAll(shard, 0o755)) + + objectPaths := []string{ + filepath.Join(shard, "123456"), + filepath.Join(shard, "654321"), + } + + cutoffDate := time.Now() + afterCutoffDate := cutoffDate.Add(1 * time.Minute) + beforeCutoffDate := cutoffDate.Add(-1 * time.Minute) + + for _, objectPath := range objectPaths { + require.NoError(t, os.WriteFile(objectPath, nil, 0o644)) + require.NoError(t, os.Chtimes(objectPath, afterCutoffDate, afterCutoffDate)) + } + + // Objects are recent, so with the cutoff-date they shouldn't be counted. + looseObjects, err := CountLooseObjects(repo, cutoffDate) + require.NoError(t, err) + require.EqualValues(t, 0, looseObjects) + + for i, objectPath := range objectPaths { + // Modify the object's mtime should cause it to be counted. + require.NoError(t, os.Chtimes(objectPath, beforeCutoffDate, beforeCutoffDate)) + + looseObjects, err = CountLooseObjects(repo, cutoffDate) + require.NoError(t, err) + require.EqualValues(t, i+1, looseObjects) + } + }) + + t.Run("shard with garbage", func(t *testing.T) { + repo, repoPath := createRepo(t) + + shard := filepath.Join(repoPath, "objects", "17") + require.NoError(t, os.MkdirAll(shard, 0o755)) + + for _, objectName := range []string{"garbage", "012345"} { + require.NoError(t, os.WriteFile(filepath.Join(shard, objectName), nil, 0o644)) + } + + looseObjects, err := CountLooseObjects(repo, time.Now()) + require.NoError(t, err) + require.EqualValues(t, 1, looseObjects) + }) +} + +func BenchmarkCountLooseObjects(b *testing.B) { + ctx := testhelper.Context(b) + cfg := testcfg.Build(b) + + createRepo := func(b *testing.B) (*localrepo.Repo, string) { + repoProto, repoPath := gittest.CreateRepository(b, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + return localrepo.NewTestRepo(b, cfg, repoProto), repoPath + } + + b.Run("empty repository", 
func(b *testing.B) { + repo, _ := createRepo(b) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := CountLooseObjects(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with single object", func(b *testing.B) { + repo, repoPath := createRepo(b) + + objectPath := filepath.Join(repoPath, "objects", "17", "12345") + require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := CountLooseObjects(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with single object in each shard", func(b *testing.B) { + repo, repoPath := createRepo(b) + + for i := 0; i < 256; i++ { + objectPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i), "12345") + require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := CountLooseObjects(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository hitting loose object limit", func(b *testing.B) { + repo, repoPath := createRepo(b) + + // Usually we shouldn't have a lot more than `looseObjectCount` objects in the + // repository because we'd repack as soon as we hit that limit. So this benchmark + // case tries to estimate the usual upper limit for loose objects we'd typically + // have. + // + // Note that we should ideally just use `housekeeping.looseObjectsLimit` here to + // derive that value. But due to a cyclic dependency that's not possible, so we + // just use a hard-coded value instead. 
+ looseObjectCount := 5 + + for i := 0; i < 256; i++ { + shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) + require.NoError(b, os.Mkdir(shardPath, 0o755)) + + for j := 0; j < looseObjectCount; j++ { + objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := CountLooseObjects(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with lots of objects", func(b *testing.B) { + repo, repoPath := createRepo(b) + + for i := 0; i < 256; i++ { + shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) + require.NoError(b, os.Mkdir(shardPath, 0o755)) + + for j := 0; j < 1000; j++ { + objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := CountLooseObjects(repo, time.Now()) + require.NoError(b, err) + } + }) +} + func TestPackfileInfoForRepository(t *testing.T) { t.Parallel() |