Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Steinhardt <psteinhardt@gitlab.com>2022-10-19 15:57:45 +0300
committerPatrick Steinhardt <psteinhardt@gitlab.com>2022-10-19 17:25:48 +0300
commitab42b0455b2417696c296c218c044e7fe7b2095d (patch)
tree614cc70b32eae2e0b79f158430d7825f216326db
parent2e8976dddc947445f33e71ba15bfd6ad6b172138 (diff)
git/stats: Refactor code parsing git-count-objects(1) to use struct
The stats package contains code that parses git-count-objects(1)'s output into an unstructured map. This map is then used to provide multiple different bits of information like the object count or gets used to write a log message. This information is more generally useful though. One usecase is to precompute the current repository's shape in `OptimizeRepository()` and pass it around for use by our heuristics that determine whether things need repacking or not. There is also additional information that's not provided by Git, like whether repositories have bitmaps or not, that we could easily compute here. But the current design where we parse information into an unstructured map is holding us back: it is not discoverable, makes refactorings unsafe, and is simply not a great interface to use more generally. Refactor the code into a new `ObjectsInfo` structure instead to change this. While it requires more intimacy with git-count-objects(1), this seems like an acceptable tradeoff to make wider use of this interface in the future. Note that this also causes us to change the logged format. We don't consider log messages to be part of our API though, and arguably the new structure should be easier to understand (e.g. "loose_objects_size" compared to the old "size").
-rw-r--r--internal/git/housekeeping/objects_test.go2
-rw-r--r--internal/git/stats/objects_info.go171
-rw-r--r--internal/git/stats/objects_info_test.go181
-rw-r--r--internal/gitaly/service/objectpool/fetch_into_object_pool_test.go22
-rw-r--r--internal/gitaly/service/repository/repack_test.go11
5 files changed, 246 insertions, 141 deletions
diff --git a/internal/git/housekeeping/objects_test.go b/internal/git/housekeeping/objects_test.go
index 8294703d9..5f6b0ec07 100644
--- a/internal/git/housekeeping/objects_test.go
+++ b/internal/git/housekeeping/objects_test.go
@@ -15,7 +15,7 @@ import (
"gitlab.com/gitlab-org/gitaly/v15/internal/testhelper/testcfg"
)
-func requireObjectCount(t *testing.T, ctx context.Context, repo *localrepo.Repo, expectedObjects int64) {
+func requireObjectCount(t *testing.T, ctx context.Context, repo *localrepo.Repo, expectedObjects uint64) {
t.Helper()
objects, err := stats.LooseObjects(ctx, repo)
diff --git a/internal/git/stats/objects_info.go b/internal/git/stats/objects_info.go
index 12a8e4643..5ef1c9289 100644
--- a/internal/git/stats/objects_info.go
+++ b/internal/git/stats/objects_info.go
@@ -3,8 +3,7 @@ package stats
import (
"bufio"
"context"
- "errors"
- "io"
+ "fmt"
"io/fs"
"os"
"path/filepath"
@@ -53,127 +52,115 @@ func GetPackfiles(repoPath string) ([]fs.DirEntry, error) {
}
// LooseObjects returns the number of loose objects that are not in a packfile.
-func LooseObjects(ctx context.Context, repo git.RepositoryExecutor) (int64, error) {
- cmd, err := repo.Exec(ctx, git.SubCmd{
- Name: "count-objects",
- Flags: []git.Option{git.Flag{Name: "--verbose"}},
- })
- if err != nil {
- return 0, err
- }
-
- objectStats, err := readObjectInfoStatistic(cmd)
+func LooseObjects(ctx context.Context, repo git.RepositoryExecutor) (uint64, error) {
+ objectsInfo, err := ObjectsInfoForRepository(ctx, repo)
if err != nil {
return 0, err
}
- count, ok := objectStats["count"].(int64)
- if !ok {
- return 0, errors.New("could not get object count")
- }
-
- return count, nil
+ return objectsInfo.LooseObjects, nil
}
// LogObjectsInfo read statistics of the git repo objects
-// and logs it under 'count-objects' key as structured entry.
+// and logs it under 'objects_info' key as structured entry.
func LogObjectsInfo(ctx context.Context, repo git.RepositoryExecutor) {
logger := ctxlogrus.Extract(ctx)
- cmd, err := repo.Exec(ctx, git.SubCmd{
- Name: "count-objects",
- Flags: []git.Option{git.Flag{Name: "--verbose"}},
- })
- if err != nil {
- logger.WithError(err).Warn("failed on bootstrapping to gather object statistic")
- return
- }
-
- stats, err := readObjectInfoStatistic(cmd)
+ objectsInfo, err := ObjectsInfoForRepository(ctx, repo)
if err != nil {
- logger.WithError(err).Warn("failed on reading to gather object statistic")
- }
-
- if err := cmd.Wait(); err != nil {
- logger.WithError(err).Warn("failed on waiting to gather object statistic")
- return
- }
-
- if len(stats) > 0 {
- logger.WithField("count_objects", stats).Info("git repo statistic")
+ logger.WithError(err).Warn("failed reading objects info")
+ } else {
+ logger.WithField("objects_info", objectsInfo).Info("repository objects info")
}
}
-/*
- readObjectInfoStatistic parses output of 'git count-objects -v' command and represents it as dictionary
-
-current supported format is:
-
- count: 12
- packs: 2
- size-garbage: 934
- alternate: /some/path/to/.git/objects
- alternate: "/some/other path/to/.git/objects"
-
-will result in:
+// ObjectsInfo contains information on the object database.
+type ObjectsInfo struct {
+ // LooseObjects is the count of loose objects.
+ LooseObjects uint64 `json:"loose_objects"`
+ // LooseObjectsSize is the accumulated on-disk size of all loose objects in KiB.
+ LooseObjectsSize uint64 `json:"loose_objects_size"`
+ // PackedObjects is the count of packed objects.
+ PackedObjects uint64 `json:"packed_objects"`
+ // Packfiles is the number of packfiles.
+ Packfiles uint64 `json:"packfiles"`
+ // PackfilesSize is the accumulated on-disk size of all packfiles in KiB.
+ PackfilesSize uint64 `json:"packfiles_size"`
+ // PrunableObjects is the number of objects that exist both as loose and as packed objects.
+ // The loose objects may be pruned in that case.
+ PruneableObjects uint64 `json:"prunable_objects"`
+ // Garbage is the count of files in the object database that are neither a valid loose
+ // object nor a valid packfile.
+ Garbage uint64 `json:"garbage"`
+ // GarbageSize is the accumulated on-disk size of garbage files.
+ GarbageSize uint64 `json:"garbage_size"`
+ // Alternates is the list of absolute paths of alternate object databases this repository is
+ // connected to.
+ Alternates []string `json:"alternates"`
+}
- {
- "count": 12,
- "packs": 2,
- "size-garbage": 934,
- "alternate": ["/some/path/to/.git/objects", "/some/other path/to/.git/objects"]
+// ObjectsInfoForRepository computes the ObjectsInfo for a repository.
+func ObjectsInfoForRepository(ctx context.Context, repo git.RepositoryExecutor) (ObjectsInfo, error) {
+ countObjects, err := repo.Exec(ctx, git.SubCmd{
+ Name: "count-objects",
+ Flags: []git.Option{git.Flag{Name: "--verbose"}},
+ })
+ if err != nil {
+ return ObjectsInfo{}, fmt.Errorf("running git-count-objects: %w", err)
}
-*/
-func readObjectInfoStatistic(reader io.Reader) (map[string]interface{}, error) {
- stats := map[string]interface{}{}
- scanner := bufio.NewScanner(reader)
+ var info ObjectsInfo
+
+ // The expected format is:
+ //
+ // count: 12
+ // packs: 2
+ // size-garbage: 934
+ // alternate: /some/path/to/.git/objects
+ // alternate: "/some/other path/to/.git/objects"
+ scanner := bufio.NewScanner(countObjects)
for scanner.Scan() {
line := scanner.Text()
+
parts := strings.SplitN(line, ": ", 2)
if len(parts) != 2 {
continue
}
- // one of: count, size, in-pack, packs, size-pack, prune-packable, garbage, size-garbage, alternate (repeatable)
- key := parts[0]
- rawVal := strings.TrimPrefix(parts[1], ": ")
-
- switch key {
+ var err error
+ switch parts[0] {
+ case "count":
+ info.LooseObjects, err = strconv.ParseUint(parts[1], 10, 64)
+ case "size":
+ info.LooseObjectsSize, err = strconv.ParseUint(parts[1], 10, 64)
+ case "in-pack":
+ info.PackedObjects, err = strconv.ParseUint(parts[1], 10, 64)
+ case "packs":
+ info.Packfiles, err = strconv.ParseUint(parts[1], 10, 64)
+ case "size-pack":
+ info.PackfilesSize, err = strconv.ParseUint(parts[1], 10, 64)
+ case "prune-packable":
+ info.PruneableObjects, err = strconv.ParseUint(parts[1], 10, 64)
+ case "garbage":
+ info.Garbage, err = strconv.ParseUint(parts[1], 10, 64)
+ case "size-garbage":
+ info.GarbageSize, err = strconv.ParseUint(parts[1], 10, 64)
case "alternate":
- addMultiString(stats, key, rawVal)
- default:
- addInt(stats, key, rawVal)
+ info.Alternates = append(info.Alternates, strings.Trim(parts[1], "\" \t\n"))
}
- }
-
- return stats, scanner.Err()
-}
-
-func addMultiString(stats map[string]interface{}, key, rawVal string) {
- val := strings.Trim(rawVal, "\" \t\n")
- statVal, found := stats[key]
- if !found {
- stats[key] = val
- return
+ if err != nil {
+ return ObjectsInfo{}, fmt.Errorf("parsing %q: %w", parts[0], err)
+ }
}
- statAggr, ok := statVal.([]string) // 'alternate' is only repeatable key and it is a string type
- if ok {
- statAggr = append(statAggr, val)
- } else {
- delete(stats, key) // remove single string value of 'alternate' to replace it with slice
- statAggr = []string{statVal.(string), val}
+ if err := scanner.Err(); err != nil {
+ return ObjectsInfo{}, fmt.Errorf("scanning object info: %w", err)
}
- stats[key] = statAggr
-}
-func addInt(stats map[string]interface{}, key, rawVal string) {
- val, err := strconv.ParseInt(rawVal, 10, 64)
- if err != nil {
- return
+ if err := countObjects.Wait(); err != nil {
+ return ObjectsInfo{}, fmt.Errorf("counting objects: %w", err)
}
- stats[key] = val
+ return info, nil
}
diff --git a/internal/git/stats/objects_info_test.go b/internal/git/stats/objects_info_test.go
index 7d30a86d5..c77c61c61 100644
--- a/internal/git/stats/objects_info_test.go
+++ b/internal/git/stats/objects_info_test.go
@@ -46,7 +46,7 @@ func TestRepositoryProfile(t *testing.T) {
looseObjects, err := LooseObjects(ctx, repo)
require.NoError(t, err)
- require.Equal(t, int64(blobs), looseObjects)
+ require.Equal(t, uint64(blobs), looseObjects)
for _, blobID := range blobIDs {
commitID := gittest.WriteCommit(t, cfg, repoPath,
@@ -64,7 +64,7 @@ func TestRepositoryProfile(t *testing.T) {
looseObjects, err = LooseObjects(ctx, repo)
require.NoError(t, err)
- require.Equal(t, int64(1), looseObjects)
+ require.Equal(t, uint64(1), looseObjects)
// write another loose object
blobID := gittest.WriteBlobs(t, cfg, repoPath, 1)[0]
@@ -75,7 +75,7 @@ func TestRepositoryProfile(t *testing.T) {
looseObjects, err = LooseObjects(ctx, repo)
require.NoError(t, err)
- require.Equal(t, int64(2), looseObjects)
+ require.Equal(t, uint64(2), looseObjects)
}
func TestLogObjectInfo(t *testing.T) {
@@ -91,25 +91,18 @@ func TestLogObjectInfo(t *testing.T) {
Seed: gittest.SeedGitLabTest,
})
- requireLog := func(entries []*logrus.Entry) map[string]interface{} {
+ requireObjectsInfo := func(entries []*logrus.Entry) ObjectsInfo {
for _, entry := range entries {
- if entry.Message == "git repo statistic" {
- const key = "count_objects"
- data := entry.Data[key]
- require.NotNil(t, data, "there is no any information about statistics")
- countObjects, ok := data.(map[string]interface{})
+ if entry.Message == "repository objects info" {
+ objectsInfo, ok := entry.Data["objects_info"]
require.True(t, ok)
- require.Contains(t, countObjects, "count")
- require.Contains(t, countObjects, "size")
- require.Contains(t, countObjects, "in-pack")
- require.Contains(t, countObjects, "packs")
- require.Contains(t, countObjects, "size-pack")
- require.Contains(t, countObjects, "garbage")
- require.Contains(t, countObjects, "size-garbage")
- return countObjects
+ require.IsType(t, ObjectsInfo{}, objectsInfo)
+ return objectsInfo.(ObjectsInfo)
}
}
- return nil
+
+ require.FailNow(t, "no objects info log entry found")
+ return ObjectsInfo{}
}
t.Run("shared repo with multiple alternates", func(t *testing.T) {
@@ -132,8 +125,8 @@ func TestLogObjectInfo(t *testing.T) {
RelativePath: filepath.Join(strings.TrimPrefix(tmpDir, storagePath), ".git"),
}))
- countObjects := requireLog(hook.AllEntries())
- require.ElementsMatch(t, []string{repoPath1 + "/objects", repoPath2 + "/objects"}, countObjects["alternate"])
+ objectsInfo := requireObjectsInfo(hook.AllEntries())
+ require.Equal(t, []string{repoPath1 + "/objects", repoPath2 + "/objects"}, objectsInfo.Alternates)
})
t.Run("repo without alternates", func(t *testing.T) {
@@ -142,7 +135,151 @@ func TestLogObjectInfo(t *testing.T) {
LogObjectsInfo(testCtx, localrepo.NewTestRepo(t, cfg, repo2))
- countObjects := requireLog(hook.AllEntries())
- require.Contains(t, countObjects, "prune-packable")
+ objectsInfo := requireObjectsInfo(hook.AllEntries())
+ require.NotZero(t, objectsInfo.LooseObjects)
+ require.NotZero(t, objectsInfo.LooseObjectsSize)
+ require.NotZero(t, objectsInfo.PackedObjects)
+ require.NotZero(t, objectsInfo.Packfiles)
+ require.NotZero(t, objectsInfo.PackfilesSize)
+ require.Nil(t, objectsInfo.Alternates)
+ })
+}
+
+func TestObjectsInfoForRepository(t *testing.T) {
+ t.Parallel()
+
+ ctx := testhelper.Context(t)
+ cfg := testcfg.Build(t)
+
+ _, alternatePath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
})
+ alternatePath = filepath.Join(alternatePath, "objects")
+
+ for _, tc := range []struct {
+ desc string
+ setup func(t *testing.T, repoPath string)
+ expectedErr error
+ expectedObjectsInfo ObjectsInfo
+ }{
+ {
+ desc: "empty repository",
+ setup: func(*testing.T, string) {
+ },
+ },
+ {
+ desc: "single blob",
+ setup: func(t *testing.T, repoPath string) {
+ gittest.WriteBlob(t, cfg, repoPath, []byte("x"))
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ LooseObjects: 1,
+ LooseObjectsSize: 4,
+ },
+ },
+ {
+ desc: "single packed blob",
+ setup: func(t *testing.T, repoPath string) {
+ blobID := gittest.WriteBlob(t, cfg, repoPath, []byte("x"))
+ gittest.WriteRef(t, cfg, repoPath, "refs/tags/blob", blobID)
+ // We use `-d`, which also prunes objects that have been packed.
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad")
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ PackedObjects: 1,
+ Packfiles: 1,
+ PackfilesSize: 1,
+ },
+ },
+ {
+ desc: "single pruneable blob",
+ setup: func(t *testing.T, repoPath string) {
+ blobID := gittest.WriteBlob(t, cfg, repoPath, []byte("x"))
+ gittest.WriteRef(t, cfg, repoPath, "refs/tags/blob", blobID)
+ // This time we don't use `-d`, so the object will exist both in
+ // loose and packed form.
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-a")
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ LooseObjects: 1,
+ LooseObjectsSize: 4,
+ PackedObjects: 1,
+ Packfiles: 1,
+ PackfilesSize: 1,
+ PruneableObjects: 1,
+ },
+ },
+ {
+ desc: "garbage",
+ setup: func(t *testing.T, repoPath string) {
+ garbagePath := filepath.Join(repoPath, "objects", "pack", "garbage")
+ require.NoError(t, os.WriteFile(garbagePath, []byte("x"), 0o600))
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ Garbage: 1,
+ // git-count-objects(1) somehow does not count this file's size,
+ // which I've verified manually.
+ GarbageSize: 0,
+ },
+ },
+ {
+ desc: "alternates",
+ setup: func(t *testing.T, repoPath string) {
+ infoAlternatesPath := filepath.Join(repoPath, "objects", "info", "alternates")
+ require.NoError(t, os.WriteFile(infoAlternatesPath, []byte(alternatePath), 0o600))
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ Alternates: []string{
+ alternatePath,
+ },
+ },
+ },
+ {
+ desc: "all together",
+ setup: func(t *testing.T, repoPath string) {
+ infoAlternatesPath := filepath.Join(repoPath, "objects", "info", "alternates")
+ require.NoError(t, os.WriteFile(infoAlternatesPath, []byte(alternatePath), 0o600))
+
+ // We write a single packed blob.
+ blobID := gittest.WriteBlob(t, cfg, repoPath, []byte("x"))
+ gittest.WriteRef(t, cfg, repoPath, "refs/tags/blob", blobID)
+ gittest.Exec(t, cfg, "-C", repoPath, "repack", "-Ad")
+
+ // And two loose ones.
+ gittest.WriteBlob(t, cfg, repoPath, []byte("1"))
+ gittest.WriteBlob(t, cfg, repoPath, []byte("2"))
+
+ // And three garbage-files. This is done so we've got unique counts
+ // everywhere.
+ for _, file := range []string{"garbage1", "garbage2", "garbage3"} {
+ garbagePath := filepath.Join(repoPath, "objects", "pack", file)
+ require.NoError(t, os.WriteFile(garbagePath, []byte("x"), 0o600))
+ }
+ },
+ expectedObjectsInfo: ObjectsInfo{
+ LooseObjects: 2,
+ LooseObjectsSize: 8,
+ PackedObjects: 1,
+ Packfiles: 1,
+ PackfilesSize: 1,
+ Garbage: 3,
+ Alternates: []string{
+ alternatePath,
+ },
+ },
+ },
+ } {
+ t.Run(tc.desc, func(t *testing.T) {
+ repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+ repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+ tc.setup(t, repoPath)
+
+ objectsInfo, err := ObjectsInfoForRepository(ctx, repo)
+ require.Equal(t, tc.expectedErr, err)
+ require.Equal(t, tc.expectedObjectsInfo, objectsInfo)
+ })
+ }
}
diff --git a/internal/gitaly/service/objectpool/fetch_into_object_pool_test.go b/internal/gitaly/service/objectpool/fetch_into_object_pool_test.go
index bb4b28984..389505b76 100644
--- a/internal/gitaly/service/objectpool/fetch_into_object_pool_test.go
+++ b/internal/gitaly/service/objectpool/fetch_into_object_pool_test.go
@@ -20,6 +20,7 @@ import (
"gitlab.com/gitlab-org/gitaly/v15/internal/git/catfile"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/gittest"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/housekeeping"
+ "gitlab.com/gitlab-org/gitaly/v15/internal/git/stats"
"gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/config"
"gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/transaction"
"gitlab.com/gitlab-org/gitaly/v15/internal/metadata"
@@ -251,26 +252,9 @@ func TestFetchIntoObjectPool_CollectLogStatistics(t *testing.T) {
_, err = client.FetchIntoObjectPool(ctx, req)
require.NoError(t, err)
- const key = "count_objects"
for _, logEntry := range hook.AllEntries() {
- if stats, ok := logEntry.Data[key]; ok {
- require.IsType(t, map[string]interface{}{}, stats)
-
- var keys []string
- for key := range stats.(map[string]interface{}) {
- keys = append(keys, key)
- }
-
- require.ElementsMatch(t, []string{
- "count",
- "garbage",
- "in-pack",
- "packs",
- "prune-packable",
- "size",
- "size-garbage",
- "size-pack",
- }, keys)
+ if objectsInfo, ok := logEntry.Data["objects_info"]; ok {
+ require.IsType(t, stats.ObjectsInfo{}, objectsInfo)
return
}
}
diff --git a/internal/gitaly/service/repository/repack_test.go b/internal/gitaly/service/repository/repack_test.go
index acb581305..53fd35bdd 100644
--- a/internal/gitaly/service/repository/repack_test.go
+++ b/internal/gitaly/service/repository/repack_test.go
@@ -222,16 +222,13 @@ func TestRepackFullCollectLogStatistics(t *testing.T) {
func mustCountObjectLog(tb testing.TB, entries ...*logrus.Entry) {
tb.Helper()
- const key = "count_objects"
+ const key = "objects_info"
for _, entry := range entries {
- if entry.Message == "git repo statistic" {
+ if entry.Message == "repository objects info" {
require.Contains(tb, entry.Data, "grpc.request.glProjectPath")
require.Contains(tb, entry.Data, "grpc.request.glRepository")
- require.Contains(tb, entry.Data, key, "statistics not found")
-
- objectStats, ok := entry.Data[key].(map[string]interface{})
- require.True(tb, ok, "expected count_objects to be a map")
- require.Contains(tb, objectStats, "count")
+ require.Contains(tb, entry.Data, key, "objects info not found")
+ require.IsType(tb, stats.ObjectsInfo{}, entry.Data[key])
return
}
}