diff options
author | James Fargher <proglottis@gmail.com> | 2022-12-01 03:05:16 +0300 |
---|---|---|
committer | James Fargher <proglottis@gmail.com> | 2022-12-01 03:05:16 +0300 |
commit | 6b4c39afc0b2a086c142b4926baeed17e8db1ff8 (patch) | |
tree | b355d6b999d48c6d35bc3b6caa1d9745983f7809 | |
parent | 3b40eea71155dc6c51c9e451361c460bedd57b30 (diff) | |
parent | 14932a2e7a491e5881becd668ed4e7eed59ce7b0 (diff) |
Merge branch 'pks-git-stats-loose-objects-info' into 'master'
git/stats: Extend statistics about loose objects
See merge request https://gitlab.com/gitlab-org/gitaly/-/merge_requests/5103
Merged-by: James Fargher <proglottis@gmail.com>
Approved-by: James Fargher <proglottis@gmail.com>
Co-authored-by: Patrick Steinhardt <psteinhardt@gitlab.com>
-rw-r--r-- | internal/git/housekeeping/optimization_strategy.go | 64 | ||||
-rw-r--r-- | internal/git/housekeeping/optimization_strategy_test.go | 203 | ||||
-rw-r--r-- | internal/git/stats/objects_info.go | 79 | ||||
-rw-r--r-- | internal/git/stats/objects_info_test.go | 218 |
4 files changed, 300 insertions, 264 deletions
diff --git a/internal/git/housekeeping/optimization_strategy.go b/internal/git/housekeeping/optimization_strategy.go index 8d26aff13..c877e00f5 100644 --- a/internal/git/housekeeping/optimization_strategy.go +++ b/internal/git/housekeeping/optimization_strategy.go @@ -8,7 +8,6 @@ import ( "math" "os" "path/filepath" - "strings" "time" "gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo" @@ -98,15 +97,12 @@ func NewHeuristicalOptimizationStrategy(ctx context.Context, repo *localrepo.Rep strategy.packfileCount = packfilesInfo.Count strategy.packfileSize = packfilesInfo.Size - strategy.looseObjectCount, err = countLooseObjects(repo, time.Now()) + looseObjectsInfo, err := stats.LooseObjectsInfoForRepository(repo, time.Now().Add(CutOffTime)) if err != nil { return strategy, fmt.Errorf("estimating loose object count: %w", err) } - - strategy.oldLooseObjectCount, err = countLooseObjects(repo, time.Now().Add(CutOffTime)) - if err != nil { - return strategy, fmt.Errorf("estimating old loose object count: %w", err) - } + strategy.looseObjectCount = looseObjectsInfo.Count + strategy.oldLooseObjectCount = looseObjectsInfo.StaleCount strategy.looseRefsCount, strategy.packedRefsSize, err = countLooseAndPackedRefs(ctx, repo) if err != nil { @@ -211,60 +207,6 @@ func (s HeuristicalOptimizationStrategy) ShouldRepackObjects() (bool, RepackObje return false, RepackObjectsConfig{} } -// countLooseObjects counts the number of loose objects in the repository. If a cutoff date is -// given, then this function will only take into account objects which are older than the given -// point in time. -func countLooseObjects(repo *localrepo.Repo, cutoffDate time.Time) (uint64, error) { - repoPath, err := repo.Path() - if err != nil { - return 0, fmt.Errorf("getting repository path: %w", err) - } - - var looseObjects uint64 - for i := 0; i <= 0xFF; i++ { - entries, err := os.ReadDir(filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i))) - if err != nil { - if errors.Is(err, os.ErrNotExist) { - continue - } - - return 0, fmt.Errorf("reading loose object shard: %w", err) - } - - for _, entry := range entries { - if !isValidLooseObjectName(entry.Name()) { - continue - } - - entryInfo, err := entry.Info() - if err != nil { - if errors.Is(err, fs.ErrNotExist) { - continue - } - - return 0, fmt.Errorf("reading object info: %w", err) - } - - if entryInfo.ModTime().After(cutoffDate) { - continue - } - - looseObjects++ - } - } - - return looseObjects, nil -} - -func isValidLooseObjectName(s string) bool { - for _, c := range []byte(s) { - if strings.IndexByte("0123456789abcdef", c) < 0 { - return false - } - } - return true -} - // ShouldWriteCommitGraph determines whether we need to write the commit-graph and how it should be // written. func (s HeuristicalOptimizationStrategy) ShouldWriteCommitGraph() (bool, WriteCommitGraphConfig) { diff --git a/internal/git/housekeeping/optimization_strategy_test.go b/internal/git/housekeeping/optimization_strategy_test.go index e03aa65ce..9db8a7439 100644 --- a/internal/git/housekeeping/optimization_strategy_test.go +++ b/internal/git/housekeeping/optimization_strategy_test.go @@ -4,7 +4,6 @@ package housekeeping import ( "fmt" - "math" "os" "path/filepath" "testing" @@ -679,208 +678,6 @@ func TestHeuristicalOptimizationStrategy_NeedsWriteCommitGraph(t *testing.T) { } } -func TestCountLooseObjects(t *testing.T) { - t.Parallel() - - ctx := testhelper.Context(t) - cfg := testcfg.Build(t) - - createRepo := func(t *testing.T) (*localrepo.Repo, string) { - repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{ - SkipCreationViaService: true, - }) - return localrepo.NewTestRepo(t, cfg, repoProto), repoPath - } - - t.Run("empty repository", func(t *testing.T) { - repo, _ := createRepo(t) - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.Zero(t, looseObjects) - }) - - t.Run("object in random shard", func(t *testing.T) { - repo, repoPath := createRepo(t) - - differentShard := filepath.Join(repoPath, "objects", "a0") - require.NoError(t, os.MkdirAll(differentShard, 0o755)) - - object, err := os.Create(filepath.Join(differentShard, "123456")) - require.NoError(t, err) - testhelper.MustClose(t, object) - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 1, looseObjects) - }) - - t.Run("objects in multiple shards", func(t *testing.T) { - repo, repoPath := createRepo(t) - - for _, shard := range []string{"00", "17", "32", "ff"} { - shardPath := filepath.Join(repoPath, "objects", shard) - require.NoError(t, os.MkdirAll(shardPath, 0o755)) - - object, err := os.Create(filepath.Join(shardPath, "123456")) - require.NoError(t, err) - testhelper.MustClose(t, object) - } - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 4, looseObjects) - }) - - t.Run("object in shard with grace period", func(t *testing.T) { - repo, repoPath := createRepo(t) - - shard := filepath.Join(repoPath, "objects", "17") - require.NoError(t, os.MkdirAll(shard, 0o755)) - - objectPaths := []string{ - filepath.Join(shard, "123456"), - filepath.Join(shard, "654321"), - } - - cutoffDate := time.Now() - afterCutoffDate := cutoffDate.Add(1 * time.Minute) - beforeCutoffDate := cutoffDate.Add(-1 * time.Minute) - - for _, objectPath := range objectPaths { - require.NoError(t, os.WriteFile(objectPath, nil, 0o644)) - require.NoError(t, os.Chtimes(objectPath, afterCutoffDate, afterCutoffDate)) - } - - // Objects are recent, so with the cutoff-date they shouldn't be counted. - looseObjects, err := countLooseObjects(repo, cutoffDate) - require.NoError(t, err) - require.EqualValues(t, 0, looseObjects) - - for i, objectPath := range objectPaths { - // Modify the object's mtime should cause it to be counted. - require.NoError(t, os.Chtimes(objectPath, beforeCutoffDate, beforeCutoffDate)) - - looseObjects, err = countLooseObjects(repo, cutoffDate) - require.NoError(t, err) - require.EqualValues(t, i+1, looseObjects) - } - }) - - t.Run("shard with garbage", func(t *testing.T) { - repo, repoPath := createRepo(t) - - shard := filepath.Join(repoPath, "objects", "17") - require.NoError(t, os.MkdirAll(shard, 0o755)) - - for _, objectName := range []string{"garbage", "012345"} { - require.NoError(t, os.WriteFile(filepath.Join(shard, objectName), nil, 0o644)) - } - - looseObjects, err := countLooseObjects(repo, time.Now()) - require.NoError(t, err) - require.EqualValues(t, 1, looseObjects) - }) -} - -func BenchmarkCountLooseObjects(b *testing.B) { - ctx := testhelper.Context(b) - cfg := testcfg.Build(b) - - createRepo := func(b *testing.B) (*localrepo.Repo, string) { - repoProto, repoPath := gittest.CreateRepository(b, ctx, cfg, gittest.CreateRepositoryConfig{ - SkipCreationViaService: true, - }) - return localrepo.NewTestRepo(b, cfg, repoProto), repoPath - } - - b.Run("empty repository", func(b *testing.B) { - repo, _ := createRepo(b) - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with single object", func(b *testing.B) { - repo, repoPath := createRepo(b) - - objectPath := filepath.Join(repoPath, "objects", "17", "12345") - require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with single object in each shard", func(b *testing.B) { - repo, repoPath := createRepo(b) - - for i := 0; i < 256; i++ { - objectPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i), "12345") - require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository hitting loose object limit", func(b *testing.B) { - repo, repoPath := createRepo(b) - - // Usually we shouldn't have a lot more than `looseObjectCount` objects in the - // repository because we'd repack as soon as we hit that limit. So this benchmark - // case tries to estimate the usual upper limit for loose objects we'd typically - // have. - looseObjectCount := int(math.Ceil(looseObjectLimit / 256)) - - for i := 0; i < 256; i++ { - shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) - require.NoError(b, os.Mkdir(shardPath, 0o755)) - - for j := 0; j < looseObjectCount; j++ { - objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) - - b.Run("repository with lots of objects", func(b *testing.B) { - repo, repoPath := createRepo(b) - - for i := 0; i < 256; i++ { - shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) - require.NoError(b, os.Mkdir(shardPath, 0o755)) - - for j := 0; j < 1000; j++ { - objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) - require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := countLooseObjects(repo, time.Now()) - require.NoError(b, err) - } - }) -} - func TestNewEagerOptimizationStrategy(t *testing.T) { t.Parallel() diff --git a/internal/git/stats/objects_info.go b/internal/git/stats/objects_info.go index eedd8316d..471746e76 100644 --- a/internal/git/stats/objects_info.go +++ b/internal/git/stats/objects_info.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "gitlab.com/gitlab-org/gitaly/v15/internal/git" @@ -189,6 +190,84 @@ func ObjectsInfoForRepository(ctx context.Context, repo *localrepo.Repo) (Object return info, nil } +// LooseObjectsInfo contains information about loose objects. +type LooseObjectsInfo struct { + // Count is the number of loose objects. + Count uint64 `json:"count"` + // Size is the total size of all loose objects in bytes. + Size uint64 `json:"size"` + // StaleCount is the number of stale loose objects when taking into account the specified cutoff + // date. + StaleCount uint64 `json:"stale_count"` + // StaleSize is the total size of stale loose objects when taking into account the specified + // cutoff date. + StaleSize uint64 `json:"stale_size"` + // GarbageCount is the number of garbage files in the loose-objects shards. + GarbageCount uint64 `json:"garbage_count"` + // GarbageSize is the total size of garbage in the loose-objects shards. + GarbageSize uint64 `json:"garbage_size"` +} + +// LooseObjectsInfoForRepository derives information about loose objects in the repository. If a +// cutoff date is given, then this function will only take into account objects which are older than +// the given point in time. +func LooseObjectsInfoForRepository(repo *localrepo.Repo, cutoffDate time.Time) (LooseObjectsInfo, error) { + repoPath, err := repo.Path() + if err != nil { + return LooseObjectsInfo{}, fmt.Errorf("getting repository path: %w", err) + } + + var info LooseObjectsInfo + for i := 0; i <= 0xFF; i++ { + entries, err := os.ReadDir(filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i))) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + + return LooseObjectsInfo{}, fmt.Errorf("reading loose object shard: %w", err) + } + + for _, entry := range entries { + entryInfo, err := entry.Info() + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + continue + } + + return LooseObjectsInfo{}, fmt.Errorf("reading object info: %w", err) + } + + if !isValidLooseObjectName(entry.Name()) { + info.GarbageCount++ + info.GarbageSize += uint64(entryInfo.Size()) + continue + } + + // Note: we don't `continue` here as we count stale objects into the total + // number of objects. + if entryInfo.ModTime().Before(cutoffDate) { + info.StaleCount++ + info.StaleSize += uint64(entryInfo.Size()) + } + + info.Count++ + info.Size += uint64(entryInfo.Size()) + } + } + + return info, nil +} + +func isValidLooseObjectName(s string) bool { + for _, c := range []byte(s) { + if strings.IndexByte("0123456789abcdef", c) < 0 { + return false + } + } + return true +} + // PackfilesInfo contains information about packfiles. type PackfilesInfo struct { // Count is the number of loose objects, including stale ones. diff --git a/internal/git/stats/objects_info_test.go b/internal/git/stats/objects_info_test.go index 85404477b..20c67addf 100644 --- a/internal/git/stats/objects_info_test.go +++ b/internal/git/stats/objects_info_test.go @@ -1,6 +1,7 @@ package stats import ( + "fmt" "os" "path/filepath" "testing" @@ -326,6 +327,223 @@ func TestObjectsInfoForRepository(t *testing.T) { } } +func TestCountLooseObjects(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + cfg := testcfg.Build(t) + + createRepo := func(t *testing.T) (*localrepo.Repo, string) { + repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + return localrepo.NewTestRepo(t, cfg, repoProto), repoPath + } + + requireLooseObjectsInfo := func(t *testing.T, repo *localrepo.Repo, cutoff time.Time, expectedInfo LooseObjectsInfo) { + info, err := LooseObjectsInfoForRepository(repo, cutoff) + require.NoError(t, err) + require.Equal(t, expectedInfo, info) + } + + t.Run("empty repository", func(t *testing.T) { + repo, _ := createRepo(t) + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{}) + }) + + t.Run("object in random shard", func(t *testing.T) { + repo, repoPath := createRepo(t) + + differentShard := filepath.Join(repoPath, "objects", "a0") + require.NoError(t, os.MkdirAll(differentShard, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(differentShard, "123456"), []byte("foobar"), 0o644)) + + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{ + Count: 1, + Size: 6, + StaleCount: 1, + StaleSize: 6, + }) + }) + + t.Run("objects in multiple shards", func(t *testing.T) { + repo, repoPath := createRepo(t) + + for i, shard := range []string{"00", "17", "32", "ff"} { + shardPath := filepath.Join(repoPath, "objects", shard) + require.NoError(t, os.MkdirAll(shardPath, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(shardPath, "123456"), make([]byte, i), 0o644)) + } + + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{ + Count: 4, + Size: 6, + StaleCount: 4, + StaleSize: 6, + }) + }) + + t.Run("object in shard with grace period", func(t *testing.T) { + repo, repoPath := createRepo(t) + + shard := filepath.Join(repoPath, "objects", "17") + require.NoError(t, os.MkdirAll(shard, 0o755)) + + objectPaths := []string{ + filepath.Join(shard, "123456"), + filepath.Join(shard, "654321"), + } + + cutoffDate := time.Now() + afterCutoffDate := cutoffDate.Add(1 * time.Minute) + beforeCutoffDate := cutoffDate.Add(-1 * time.Minute) + + for _, objectPath := range objectPaths { + require.NoError(t, os.WriteFile(objectPath, []byte("1"), 0o644)) + require.NoError(t, os.Chtimes(objectPath, afterCutoffDate, afterCutoffDate)) + } + + // Objects are recent, so with the cutoff-date they shouldn't be counted. + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{ + Count: 2, + Size: 2, + }) + + for i, objectPath := range objectPaths { + // Modify the object's mtime should cause it to be counted. + require.NoError(t, os.Chtimes(objectPath, beforeCutoffDate, beforeCutoffDate)) + + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{ + Count: 2, + Size: 2, + StaleCount: uint64(i) + 1, + StaleSize: uint64(i) + 1, + }) + } + }) + + t.Run("shard with garbage", func(t *testing.T) { + repo, repoPath := createRepo(t) + + shard := filepath.Join(repoPath, "objects", "17") + require.NoError(t, os.MkdirAll(shard, 0o755)) + + require.NoError(t, os.WriteFile(filepath.Join(shard, "012345"), []byte("valid"), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(shard, "garbage"), []byte("garbage"), 0o644)) + + requireLooseObjectsInfo(t, repo, time.Now(), LooseObjectsInfo{ + Count: 1, + Size: 5, + StaleCount: 1, + StaleSize: 5, + GarbageCount: 1, + GarbageSize: 7, + }) + }) +} + +func BenchmarkCountLooseObjects(b *testing.B) { + ctx := testhelper.Context(b) + cfg := testcfg.Build(b) + + createRepo := func(b *testing.B) (*localrepo.Repo, string) { + repoProto, repoPath := gittest.CreateRepository(b, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + return localrepo.NewTestRepo(b, cfg, repoProto), repoPath + } + + b.Run("empty repository", func(b *testing.B) { + repo, _ := createRepo(b) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := LooseObjectsInfoForRepository(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with single object", func(b *testing.B) { + repo, repoPath := createRepo(b) + + objectPath := filepath.Join(repoPath, "objects", "17", "12345") + require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := LooseObjectsInfoForRepository(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with single object in each shard", func(b *testing.B) { + repo, repoPath := createRepo(b) + + for i := 0; i < 256; i++ { + objectPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i), "12345") + require.NoError(b, os.Mkdir(filepath.Dir(objectPath), 0o755)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := LooseObjectsInfoForRepository(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository hitting loose object limit", func(b *testing.B) { + repo, repoPath := createRepo(b) + + // Usually we shouldn't have a lot more than `looseObjectCount` objects in the + // repository because we'd repack as soon as we hit that limit. So this benchmark + // case tries to estimate the usual upper limit for loose objects we'd typically + // have. + // + // Note that we should ideally just use `housekeeping.looseObjectsLimit` here to + // derive that value. But due to a cyclic dependency that's not possible, so we + // just use a hard-coded value instead. + looseObjectCount := 5 + + for i := 0; i < 256; i++ { + shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) + require.NoError(b, os.Mkdir(shardPath, 0o755)) + + for j := 0; j < looseObjectCount; j++ { + objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := LooseObjectsInfoForRepository(repo, time.Now()) + require.NoError(b, err) + } + }) + + b.Run("repository with lots of objects", func(b *testing.B) { + repo, repoPath := createRepo(b) + + for i := 0; i < 256; i++ { + shardPath := filepath.Join(repoPath, "objects", fmt.Sprintf("%02x", i)) + require.NoError(b, os.Mkdir(shardPath, 0o755)) + + for j := 0; j < 1000; j++ { + objectPath := filepath.Join(shardPath, fmt.Sprintf("%d", j)) + require.NoError(b, os.WriteFile(objectPath, nil, 0o644)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := LooseObjectsInfoForRepository(repo, time.Now()) + require.NoError(b, err) + } + }) +} + func TestPackfileInfoForRepository(t *testing.T) { t.Parallel() |