diff options
author | Toon Claes <toon@gitlab.com> | 2022-06-03 17:45:42 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2022-07-08 11:24:17 +0300 |
commit | 5709020ecbf5ff34658767230053225eae643603 (patch) | |
tree | 5b606ab9c2b52c4a9da9a3d300a92bc21e254c6f | |
parent | 228c7f63daf73362e2ec8daa18026d63b8161324 (diff) |
linguist: Implement struct to store stats
We're about to introduce an implementation of getting the language
statistics in Go. For this we need a way to collect and store the
results in a cache. This cache will be used to incrementally calculate the
stats between commits.
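This change introduces the linguist.languageStats struct, which accumulates
the per-file and total byte counts per language and saves them to, and loads
them from, a cache file in the repository.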
-rw-r--r-- | internal/gitaly/linguist/language_stats.go | 184 | ||||
-rw-r--r-- | internal/gitaly/linguist/language_stats_test.go | 213 |
2 files changed, 397 insertions, 0 deletions
diff --git a/internal/gitaly/linguist/language_stats.go b/internal/gitaly/linguist/language_stats.go
new file mode 100644
index 000000000..9dad17c11
--- /dev/null
+++ b/internal/gitaly/linguist/language_stats.go
@@ -0,0 +1,184 @@
+package linguist
+
+import (
+	"compress/zlib"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo"
+)
+
+const (
+	// languageStatsFilename is the name of the file in the repo that stores
+	// a cached version of the language statistics. The name is
+	// intentionally different from what the linguist gem uses.
+	languageStatsFilename = "gitaly-language.stats"
+	// languageStatsVersion is the file format version written by save; a
+	// cache carrying any other version is rejected by newLanguageStats.
+	languageStatsVersion = "v1:gitaly"
+)
+
+// languageStats takes care of accumulating and caching language statistics for
+// a repository.
+type languageStats struct {
+	// Version holds the file format version
+	Version string `json:"version"`
+	// CommitID holds the commit ID for the cached Totals
+	CommitID string `json:"commit_id"`
+
+	// m protects concurrent access to the Totals & ByFile maps
+	m sync.Mutex
+
+	// Totals contains the total statistics for the CommitID
+	Totals ByteCountPerLanguage `json:"totals"`
+	// ByFile contains the statistics for a single file, where the filename
+	// is its key.
+	ByFile map[string]ByteCountPerLanguage `json:"by_file"`
+}
+
+// newLanguageStats creates a languageStats object and tries to load the
+// optionally available stats from file. A missing cache file is not an error.
+// When loading fails for any other reason, an empty languageStats is returned
+// together with the error.
+func newLanguageStats(repo *localrepo.Repo) (*languageStats, error) {
+	stats := languageStats{
+		Totals: ByteCountPerLanguage{},
+		ByFile: make(map[string]ByteCountPerLanguage),
+	}
+
+	objPath, err := repo.Path()
+	if err != nil {
+		return &stats, fmt.Errorf("new language stats get repo path: %w", err)
+	}
+
+	file, err := os.Open(filepath.Join(objPath, languageStatsFilename))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return &stats, nil
+		}
+		return &stats, fmt.Errorf("new language stats open: %w", err)
+	}
+	defer file.Close()
+
+	r, err := zlib.NewReader(file)
+	if err != nil {
+		return &stats, fmt.Errorf("new language stats zlib reader: %w", err)
+	}
+	defer r.Close()
+
+	var loaded languageStats
+	if err = json.NewDecoder(r).Decode(&loaded); err != nil {
+		return &stats, fmt.Errorf("new language stats json decode: %w", err)
+	}
+
+	if loaded.Version != languageStatsVersion {
+		return &stats, fmt.Errorf("new language stats version mismatch %s vs %s", languageStatsVersion, loaded.Version)
+	}
+
+	// A cache without (or with null) maps decodes to nil maps, and writing
+	// to a nil map panics. Make sure add can always be called.
+	if loaded.Totals == nil {
+		loaded.Totals = ByteCountPerLanguage{}
+	}
+	if loaded.ByFile == nil {
+		loaded.ByFile = make(map[string]ByteCountPerLanguage)
+	}
+
+	return &loaded, nil
+}
+
+// add the statistics for the given filename, replacing whatever was recorded
+// for that file before.
+func (c *languageStats) add(filename, language string, size uint64) {
+	c.m.Lock()
+	defer c.m.Unlock()
+
+	c.subtract(filename)
+
+	c.ByFile[filename] = ByteCountPerLanguage{language: size}
+	c.Totals[language] += size
+}
+
+// drop statistics for the given files
+func (c *languageStats) drop(filenames ...string) {
+	c.m.Lock()
+	defer c.m.Unlock()
+
+	for _, f := range filenames {
+		c.subtract(f)
+		delete(c.ByFile, f)
+	}
+}
+
+// subtract removes the byte counts recorded for filename from Totals. The
+// caller must hold c.m.
+func (c *languageStats) subtract(filename string) {
+	for language, size := range c.ByFile[filename] {
+		// The counts are unsigned: compare before subtracting, because
+		// the subtraction would wrap around rather than go negative.
+		if c.Totals[language] <= size {
+			delete(c.Totals, language)
+			continue
+		}
+		c.Totals[language] -= size
+	}
+}
+
+// save the language stats to file in the repository. The stats are written to
+// a temporary file that is renamed into place once it is complete.
+func (c *languageStats) save(repo *localrepo.Repo, commitID string) error {
+	// Encoding iterates over Totals and ByFile, so keep add and drop out
+	// while the stats are being written.
+	c.m.Lock()
+	defer c.m.Unlock()
+
+	c.CommitID = commitID
+	c.Version = languageStatsVersion
+
+	repoPath, err := repo.Path()
+	if err != nil {
+		return fmt.Errorf("languageStats save get repo path: %w", err)
+	}
+
+	tempPath, err := repo.StorageTempDir()
+	if err != nil {
+		return fmt.Errorf("languageStats locate temp dir: %w", err)
+	}
+
+	file, err := os.CreateTemp(tempPath, languageStatsFilename)
+	if err != nil {
+		return fmt.Errorf("languageStats create temp file: %w", err)
+	}
+	defer func() {
+		// Best-effort cleanup for the error paths. After a successful
+		// save the file is already closed and renamed, so both calls
+		// fail harmlessly.
+		_ = file.Close()
+		_ = os.Remove(file.Name())
+	}()
+
+	w := zlib.NewWriter(file)
+
+	if err = json.NewEncoder(w).Encode(c); err != nil {
+		return fmt.Errorf("languageStats encode json: %w", err)
+	}
+
+	if err = w.Close(); err != nil {
+		return fmt.Errorf("languageStats zlib write: %w", err)
+	}
+	if err = file.Sync(); err != nil {
+		return fmt.Errorf("languageStats flush: %w", err)
+	}
+	if err = file.Close(); err != nil {
+		return fmt.Errorf("languageStats close: %w", err)
+	}
+
+	if err = os.Rename(file.Name(), filepath.Join(repoPath, languageStatsFilename)); err != nil {
+		return fmt.Errorf("languageStats rename: %w", err)
+	}
+
+	return nil
+}
diff --git a/internal/gitaly/linguist/language_stats_test.go b/internal/gitaly/linguist/language_stats_test.go
new file mode 100644
index 000000000..56f03bce7
--- /dev/null
+++ b/internal/gitaly/linguist/language_stats_test.go
@@ -0,0 +1,213 @@
+package linguist
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"gitlab.com/gitlab-org/gitaly/v15/internal/git/gittest"
+	"gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo"
+	"gitlab.com/gitlab-org/gitaly/v15/internal/testhelper/testcfg"
+)
+
+func TestNewLanguageStats(t *testing.T) {
+	t.Parallel()
+
+	cfg := testcfg.Build(t)
+	repoProto, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0])
+	repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+	t.Run("non-existing cache", func(t *testing.T) {
+		s, err := newLanguageStats(repo)
+		require.NoError(t, err)
+		require.Empty(t, s.Totals)
+		require.Empty(t, s.ByFile)
+	})
+
+	t.Run("pre-existing cache", func(t *testing.T) {
+		s, err := newLanguageStats(repo)
+		require.NoError(t, err)
+
+		s.Totals["C"] = 555
+		require.NoError(t, s.save(repo, "badcafe"))
+
+		// Load the cache again so the assertions cover what was
+		// written to disk rather than the in-memory value.
+		loaded, err := newLanguageStats(repo)
+		require.NoError(t, err)
+		require.Equal(t, "badcafe", loaded.CommitID)
+		require.Equal(t, ByteCountPerLanguage{"C": 555}, loaded.Totals)
+	})
+
+	t.Run("corrupt cache", func(t *testing.T) {
+		require.NoError(t, os.WriteFile(filepath.Join(repoPath, languageStatsFilename), []byte("garbage"), 0o644))
+
+		s, err := newLanguageStats(repo)
+		require.EqualError(t, err, "new language stats zlib reader: zlib: invalid header")
+		require.Empty(t, s.Totals)
+		require.Empty(t, s.ByFile)
+	})
+}
+
+func TestLanguageStats_add(t *testing.T) {
+	t.Parallel()
+
+	cfg := testcfg.Build(t)
+	repoProto, _ := gittest.CloneRepo(t, cfg, cfg.Storages[0])
+	repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+	for _, tc := range []struct {
+		desc string
+		run  func(*testing.T, *languageStats)
+	}{
+		{
+			desc: "adds to the total",
+			run: func(t *testing.T, s *languageStats) {
+				s.add("main.go", "Go", 100)
+
+				require.Equal(t, uint64(100), s.Totals["Go"])
+				require.Len(t, s.ByFile, 1)
+				require.Equal(t, ByteCountPerLanguage{"Go": 100}, s.ByFile["main.go"])
+			},
+		},
+		{
+			desc: "accumulates",
+			run: func(t *testing.T, s *languageStats) {
+				s.add("main.go", "Go", 100)
+				s.add("main_test.go", "Go", 80)
+
+				require.Equal(t, uint64(180), s.Totals["Go"])
+				require.Len(t, s.ByFile, 2)
+				require.Equal(t, ByteCountPerLanguage{"Go": 100}, s.ByFile["main.go"])
+				require.Equal(t, ByteCountPerLanguage{"Go": 80}, s.ByFile["main_test.go"])
+			},
+		},
+		{
+			desc: "languages don't interfere",
+			run: func(t *testing.T, s *languageStats) {
+				s.add("main.go", "Go", 60)
+				s.add("Makefile", "Make", 30)
+
+				require.Equal(t, uint64(60), s.Totals["Go"])
+				require.Equal(t, uint64(30), s.Totals["Make"])
+				require.Len(t, s.ByFile, 2)
+				require.Equal(t, ByteCountPerLanguage{"Go": 60}, s.ByFile["main.go"])
+				require.Equal(t, ByteCountPerLanguage{"Make": 30}, s.ByFile["Makefile"])
+			},
+		},
+		{
+			desc: "updates the stat for a file",
+			run: func(t *testing.T, s *languageStats) {
+				s.add("main.go", "Go", 60)
+				s.add("main.go", "Go", 30)
+
+				require.Equal(t, uint64(30), s.Totals["Go"])
+				require.Len(t, s.ByFile, 1)
+				require.Equal(t, ByteCountPerLanguage{"Go": 30}, s.ByFile["main.go"])
+			},
+		},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			s, err := newLanguageStats(repo)
+			require.NoError(t, err)
+
+			tc.run(t, s)
+		})
+	}
+}
+
+func TestLanguageStats_drop(t *testing.T) {
+	t.Parallel()
+
+	cfg := testcfg.Build(t)
+	repoProto, _ := gittest.CloneRepo(t, cfg, cfg.Storages[0])
+	repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+	for _, tc := range []struct {
+		desc string
+		run  func(*testing.T, *languageStats)
+	}{
+		{
+			desc: "existing file",
+			run: func(t *testing.T, s *languageStats) {
+				s.drop("main.go")
+
+				require.Equal(t, uint64(20), s.Totals["Go"])
+				require.Len(t, s.ByFile, 1)
+				require.Equal(t, ByteCountPerLanguage{"Go": 20}, s.ByFile["main_test.go"])
+			},
+		},
+		{
+			desc: "non-existing file",
+			run: func(t *testing.T, s *languageStats) {
+				s.drop("foo.go")
+
+				require.Equal(t, uint64(100), s.Totals["Go"])
+				require.Len(t, s.ByFile, 2)
+				require.Equal(t, ByteCountPerLanguage{"Go": 80}, s.ByFile["main.go"])
+				require.Equal(t, ByteCountPerLanguage{"Go": 20}, s.ByFile["main_test.go"])
+			},
+		},
+		{
+			desc: "all files",
+			run: func(t *testing.T, s *languageStats) {
+				s.drop("main.go", "main_test.go")
+
+				require.Empty(t, s.Totals)
+				require.Empty(t, s.ByFile)
+			},
+		},
+		{
+			desc: "total smaller than the file's count",
+			run: func(t *testing.T, s *languageStats) {
+				// The total must be removed instead of wrapping
+				// around to a huge unsigned value.
+				s.Totals["Go"] = 10
+
+				s.drop("main.go")
+
+				require.Empty(t, s.Totals)
+				require.Len(t, s.ByFile, 1)
+			},
+		},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			s, err := newLanguageStats(repo)
+			require.NoError(t, err)
+
+			s.Totals["Go"] = 100
+			s.ByFile["main.go"] = ByteCountPerLanguage{"Go": 80}
+			s.ByFile["main_test.go"] = ByteCountPerLanguage{"Go": 20}
+
+			tc.run(t, s)
+		})
+	}
+}
+
+func TestLanguageStats_save(t *testing.T) {
+	t.Parallel()
+
+	cfg := testcfg.Build(t)
+	repoProto, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0])
+	repo := localrepo.NewTestRepo(t, cfg, repoProto)
+
+	s, err := newLanguageStats(repo)
+	require.NoError(t, err)
+
+	s.Totals["Go"] = 100
+	s.ByFile["main.go"] = ByteCountPerLanguage{"Go": 80}
+	s.ByFile["main_test.go"] = ByteCountPerLanguage{"Go": 20}
+
+	err = s.save(repo, "buzz")
+	require.NoError(t, err)
+	require.FileExists(t, filepath.Join(repoPath, languageStatsFilename))
+
+	loaded, err := newLanguageStats(repo)
+	require.NoError(t, err)
+
+	require.Equal(t, "buzz", loaded.CommitID)
+	require.Equal(t, languageStatsVersion, loaded.Version)
+	require.Equal(t, s.Totals, loaded.Totals)
+	require.Equal(t, s.ByFile, loaded.ByFile)
+}