diff options
author | Toon Claes <toon@gitlab.com> | 2022-06-03 18:10:22 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2022-07-08 11:24:17 +0300 |
commit | efd9a598f50e03f05620b56f2e010600128f3b1c (patch) | |
tree | d5dc45554edb74a3342dbe551361f646321ef34a /internal/gitaly/linguist/linguist.go | |
parent | 5709020ecbf5ff34658767230053225eae643603 (diff) |
linguist: Implement Stats in pure Go
This change adds an alternative implementation of linguist.Stats using
go-enry as a pure Go solution. The code is behind the feature flag
'go_language_stats', which is disabled by default.
Issue: https://gitlab.com/gitlab-org/gitaly/-/issues/2571
Changelog: performance
Diffstat (limited to 'internal/gitaly/linguist/linguist.go')
-rw-r--r-- | internal/gitaly/linguist/linguist.go | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go
index 5e09431e3..2a1a92c60 100644
--- a/internal/gitaly/linguist/linguist.go
+++ b/internal/gitaly/linguist/linguist.go
@@ -10,12 +10,16 @@ import (
 	"os/exec"
 	"path/filepath"
 
+	"github.com/go-enry/go-enry/v2"
+	"github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/command"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/git"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/git/catfile"
+	"gitlab.com/gitlab-org/gitaly/v15/internal/git/gitpipe"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/config"
 	"gitlab.com/gitlab-org/gitaly/v15/internal/helper/env"
+	"gitlab.com/gitlab-org/gitaly/v15/internal/metadata/featureflag"
 )
 
 // Language is used to parse Linguist's language.json file.
@@ -56,6 +60,10 @@ func New(cfg config.Cfg, gitCmdFactory git.CommandFactory) (*Instance, error) {
 
 // Stats returns the repository's language stats as reported by 'git-linguist'.
 func (inst *Instance) Stats(ctx context.Context, repo *localrepo.Repo, commitID string, catfileCache catfile.Cache) (ByteCountPerLanguage, error) {
+	if featureflag.GoLanguageStats.IsEnabled(ctx) {
+		return inst.enryStats(ctx, repo, commitID, catfileCache)
+	}
+
 	repoPath, err := repo.Path()
 	if err != nil {
 		return nil, fmt.Errorf("get repo path: %w", err)
@@ -152,3 +160,98 @@ func openLanguagesJSON(cfg config.Cfg) (io.ReadCloser, error) {
 
 	return os.Open(filepath.Join(linguistPathSymlink.Name(), "lib", "linguist", "languages.json"))
 }
+
+func (inst *Instance) enryStats(ctx context.Context, repo *localrepo.Repo, commitID string, catfileCache catfile.Cache) (ByteCountPerLanguage, error) {
+	stats, err := newLanguageStats(repo)
+	if err != nil {
+		ctxlogrus.Extract(ctx).WithError(err).Info("linguist load from cache")
+	}
+	if stats.CommitID == commitID {
+		return stats.Totals, nil
+	}
+
+	objectReader, cancel, err := catfileCache.ObjectReader(ctx, repo)
+	if err != nil {
+		return nil, fmt.Errorf("create object reader: %w", err)
+	}
+	defer cancel()
+
+	var revlistIt gitpipe.RevisionIterator
+
+	if stats.CommitID == "" {
+		// No existing stats cached, so get all the files for the commit
+		// using git-ls-tree(1).
+		revlistIt = gitpipe.LsTree(ctx, repo,
+			commitID,
+			gitpipe.LsTreeWithRecursive(),
+			gitpipe.LsTreeWithBlobFilter(),
+		)
+	} else {
+		// Stats are cached for one commit, so get the git-diff-tree(1)
+		// between that commit and the one we're calculating stats for.
+
+		skipDeleted := func(result *gitpipe.RevisionResult) bool {
+			// Skip files that are deleted.
+			if result.OID.IsZeroOID() {
+				// It's a little bit of a hack to use this skip
+				// function, but for every file that's deleted,
+				// remove the stats.
+				stats.drop(string(result.ObjectName))
+				return true
+			}
+			return false
+		}
+
+		revlistIt = gitpipe.DiffTree(ctx, repo,
+			stats.CommitID, commitID,
+			gitpipe.DiffTreeWithRecursive(),
+			gitpipe.DiffTreeWithIgnoreSubmodules(),
+			gitpipe.DiffTreeWithSkip(skipDeleted),
+		)
+	}
+
+	objectIt, err := gitpipe.CatfileObject(ctx, objectReader, revlistIt)
+	if err != nil {
+		return nil, fmt.Errorf("linguist gitpipe: %w", err)
+	}
+
+	for objectIt.Next() {
+		object := objectIt.Result()
+		filename := string(object.ObjectName)
+
+		// Read arbitrary number of bytes considered enough to determine language
+		content, err := io.ReadAll(io.LimitReader(object, 2048))
+		if err != nil {
+			return nil, fmt.Errorf("linguist read blob: %w", err)
+		}
+
+		if _, err := io.Copy(io.Discard, object); err != nil {
+			return nil, fmt.Errorf("linguist discard excess blob: %w", err)
+		}
+
+		lang := enry.GetLanguage(filename, content)
+
+		// Ignore anything that's neither markup nor a programming language,
+		// similar to what the linguist gem does:
+		// https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387
+		if enry.GetLanguageType(lang) != enry.Programming &&
+			enry.GetLanguageType(lang) != enry.Markup {
+			// The file might have been included in the stats before
+			stats.drop(filename)
+
+			continue
+		}
+
+		stats.add(filename, lang, uint64(object.Object.ObjectSize()))
+	}
+
+	if err := objectIt.Err(); err != nil {
+		return nil, fmt.Errorf("linguist object iterator: %w", err)
+	}
+
+	if err := stats.save(repo, commitID); err != nil {
+		return nil, fmt.Errorf("linguist language stats save: %w", err)
+	}
+
+	return stats.Totals, nil
+}