Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Toon Claes <toon@gitlab.com> 2022-06-03 18:10:22 +0300
committer Toon Claes <toon@gitlab.com> 2022-07-08 11:24:17 +0300
commit efd9a598f50e03f05620b56f2e010600128f3b1c (patch)
tree d5dc45554edb74a3342dbe551361f646321ef34a /internal/gitaly/linguist/linguist.go
parent 5709020ecbf5ff34658767230053225eae643603 (diff)
linguist: Implement Stats in pure Go
This change adds an alternative implementation of linguist.Stats using go-enry as a pure Go solution. The code is behind the feature flag 'go_language_stats', which is disabled by default.

Issue: https://gitlab.com/gitlab-org/gitaly/-/issues/2571
Changelog: performance
Diffstat (limited to 'internal/gitaly/linguist/linguist.go')
-rw-r--r-- internal/gitaly/linguist/linguist.go 103
1 files changed, 103 insertions, 0 deletions
diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go
index 5e09431e3..2a1a92c60 100644
--- a/internal/gitaly/linguist/linguist.go
+++ b/internal/gitaly/linguist/linguist.go
@@ -10,12 +10,16 @@ import (
"os/exec"
"path/filepath"
+ "github.com/go-enry/go-enry/v2"
+ "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
"gitlab.com/gitlab-org/gitaly/v15/internal/command"
"gitlab.com/gitlab-org/gitaly/v15/internal/git"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/catfile"
+ "gitlab.com/gitlab-org/gitaly/v15/internal/git/gitpipe"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo"
"gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/config"
"gitlab.com/gitlab-org/gitaly/v15/internal/helper/env"
+ "gitlab.com/gitlab-org/gitaly/v15/internal/metadata/featureflag"
)
// Language is used to parse Linguist's language.json file.
@@ -56,6 +60,10 @@ func New(cfg config.Cfg, gitCmdFactory git.CommandFactory) (*Instance, error) {
// Stats returns the repository's language stats as reported by 'git-linguist'.
func (inst *Instance) Stats(ctx context.Context, repo *localrepo.Repo, commitID string, catfileCache catfile.Cache) (ByteCountPerLanguage, error) {
+ if featureflag.GoLanguageStats.IsEnabled(ctx) {
+ return inst.enryStats(ctx, repo, commitID, catfileCache)
+ }
+
repoPath, err := repo.Path()
if err != nil {
return nil, fmt.Errorf("get repo path: %w", err)
@@ -152,3 +160,98 @@ func openLanguagesJSON(cfg config.Cfg) (io.ReadCloser, error) {
return os.Open(filepath.Join(linguistPathSymlink.Name(), "lib", "linguist", "languages.json"))
}
+// enryStats computes the per-language byte counts of commitID with go-enry, starting from the stats loaded by newLanguageStats (returned as-is when they already belong to commitID) and saving the updated result.
+func (inst *Instance) enryStats(ctx context.Context, repo *localrepo.Repo, commitID string, catfileCache catfile.Cache) (ByteCountPerLanguage, error) {
+ stats, err := newLanguageStats(repo)
+ if err != nil {
+ ctxlogrus.Extract(ctx).WithError(err).Info("linguist load from cache")
+ }
+ if stats.CommitID == commitID {
+ return stats.Totals, nil
+ }
+
+ objectReader, cancel, err := catfileCache.ObjectReader(ctx, repo)
+ if err != nil {
+ return nil, fmt.Errorf("create object reader: %w", err)
+ }
+ defer cancel()
+
+ var revlistIt gitpipe.RevisionIterator
+
+ if stats.CommitID == "" {
+ // The loaded stats carry no commit ID, so nothing is cached yet:
+ // list all the files of the commit with a recursive git-ls-tree(1).
+ revlistIt = gitpipe.LsTree(ctx, repo,
+ commitID,
+ gitpipe.LsTreeWithRecursive(),
+ gitpipe.LsTreeWithBlobFilter(),
+ )
+ } else {
+ // Stats are cached for another commit, so only visit what
+ // git-diff-tree(1) reports between that commit and commitID.
+
+ skipDeleted := func(result *gitpipe.RevisionResult) bool {
+ // Deleted files are reported with a zero OID; skip them.
+ if result.OID.IsZeroOID() {
+ // Using the skip callback for its side effect is a
+ // little bit of a hack, but it is where the stats of
+ // every deleted file get removed.
+ stats.drop(string(result.ObjectName))
+ return true
+ }
+ return false
+ }
+
+ revlistIt = gitpipe.DiffTree(ctx, repo,
+ stats.CommitID, commitID,
+ gitpipe.DiffTreeWithRecursive(),
+ gitpipe.DiffTreeWithIgnoreSubmodules(),
+ gitpipe.DiffTreeWithSkip(skipDeleted),
+ )
+ }
+
+ objectIt, err := gitpipe.CatfileObject(ctx, objectReader, revlistIt)
+ if err != nil {
+ return nil, fmt.Errorf("linguist gitpipe: %w", err)
+ }
+
+ for objectIt.Next() {
+ object := objectIt.Result()
+ filename := string(object.ObjectName)
+
+ // Read at most 2048 bytes, an arbitrary amount considered enough to detect the language; the rest of the blob is discarded below
+ content, err := io.ReadAll(io.LimitReader(object, 2048))
+ if err != nil {
+ return nil, fmt.Errorf("linguist read blob: %w", err)
+ }
+
+ if _, err := io.Copy(io.Discard, object); err != nil {
+ return nil, fmt.Errorf("linguist discard excess blob: %w", err)
+ }
+
+ lang := enry.GetLanguage(filename, content)
+
+ // Only programming and markup languages are counted, similar to
+ // the filtering done by the linguist gem:
+ // https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387
+ if enry.GetLanguageType(lang) != enry.Programming &&
+ enry.GetLanguageType(lang) != enry.Markup {
+ // The file might have been counted in the loaded stats, so remove it
+ stats.drop(filename)
+
+ continue
+ }
+
+ stats.add(filename, lang, uint64(object.Object.ObjectSize()))
+ }
+
+ if err := objectIt.Err(); err != nil {
+ return nil, fmt.Errorf("linguist object iterator: %w", err)
+ }
+
+ if err := stats.save(repo, commitID); err != nil {
+ return nil, fmt.Errorf("linguist language stats save: %w", err)
+ }
+
+ return stats.Totals, nil
+}