diff options
author | Toon Claes <toon@gitlab.com> | 2022-09-29 17:30:43 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2022-10-03 16:26:16 +0300 |
commit | bad57b12f2234d3d1246dce658282d34a7b8f186 (patch) | |
tree | 5f06145bc0e9cac79cf458c0fa6fa5f22899377b | |
parent | aa1e5f03017c4343fbdc3a2d527321ccd8a54f12 (diff) |
linguist: Exclude some files from stats
Files that are either documentation, vendored, or generated should be
excluded from the stats.
-rw-r--r-- | internal/gitaly/linguist/file_instance.go | 41 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist.go | 16 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist_test.go | 84 |
3 files changed, 131 insertions, 10 deletions
diff --git a/internal/gitaly/linguist/file_instance.go b/internal/gitaly/linguist/file_instance.go index 5c01da23e..9b5484bf5 100644 --- a/internal/gitaly/linguist/file_instance.go +++ b/internal/gitaly/linguist/file_instance.go @@ -18,6 +18,35 @@ func newFileInstance(filename string) fileInstance { } } +func (f fileInstance) isDocumentation() bool { + return enry.IsDocumentation(f.filename) +} + +func (f fileInstance) isVendored() bool { + return enry.IsVendor(f.filename) +} + +func (f fileInstance) isGenerated(content []byte) bool { + return enry.IsGenerated(f.filename, content) +} + +func (f fileInstance) getLanguage(content []byte) string { + return enry.GetLanguage(f.filename, content) +} + +func (f fileInstance) isIgnoredLanguage(lang string) bool { + // Ignore anything that's neither markup nor a programming language, + // similar to what the linguist gem does: + // https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387 + return enry.GetLanguageType(lang) != enry.Programming && + enry.GetLanguageType(lang) != enry.Markup +} + +// IsExcluded returns whether the file should be excluded from the stats based on its filename alone, i.e. whether it is documentation or vendored. +func (f fileInstance) IsExcluded() bool { + return f.isDocumentation() || f.isVendored() +} + // DetermineStats determines the size and language of the given file. The // language will be an empty string when the stats should be omitted from the // count.
@@ -28,13 +57,13 @@ func (f fileInstance) DetermineStats(object gitpipe.CatfileObjectResult) (string return "", 0, fmt.Errorf("determine stats read blob: %w", err) } - lang := enry.GetLanguage(f.filename, content) + if f.isGenerated(content) { + return "", 0, nil + } + + lang := f.getLanguage(content) - // Ignore anything that's neither markup nor a programming language, - // similar to what the linguist gem does: - // https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387 - if enry.GetLanguageType(lang) != enry.Programming && - enry.GetLanguageType(lang) != enry.Markup { + if f.isIgnoredLanguage(lang) { return "", 0, nil } diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go index c17cad698..6152797cc 100644 --- a/internal/gitaly/linguist/linguist.go +++ b/internal/gitaly/linguist/linguist.go @@ -122,20 +122,28 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount var revlistIt gitpipe.RevisionIterator if stats.CommitID == "" { + skipFunc := func(result *gitpipe.RevisionResult) bool { + // Skip files that are an excluded filetype based on filename. + return newFileInstance(string(result.ObjectName)).IsExcluded() + } + // No existing stats cached, so get all the files for the commit // using git-ls-tree(1). revlistIt = gitpipe.LsTree(ctx, inst.repo, commitID, gitpipe.LsTreeWithRecursive(), gitpipe.LsTreeWithBlobFilter(), + gitpipe.LsTreeWithSkip(skipFunc), ) } else { // Stats are cached for one commit, so get the git-diff-tree(1) // between that commit and the one we're calculating stats for. - skipDeleted := func(result *gitpipe.RevisionResult) bool { - // Skip files that are deleted. - if git.ObjectHashSHA1.IsZeroOID(result.OID) { + skipFunc := func(result *gitpipe.RevisionResult) bool { + // Skip files that are deleted, or + // an excluded filetype based on filename. 
+ if git.ObjectHashSHA1.IsZeroOID(result.OID) || + newFileInstance(string(result.ObjectName)).IsExcluded() { // It's a little bit of a hack to use this skip // function, but for every file that's deleted, // remove the stats. @@ -149,7 +157,7 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount stats.CommitID, commitID, gitpipe.DiffTreeWithRecursive(), gitpipe.DiffTreeWithIgnoreSubmodules(), - gitpipe.DiffTreeWithSkip(skipDeleted), + gitpipe.DiffTreeWithSkip(skipFunc), ) } diff --git a/internal/gitaly/linguist/linguist_test.go b/internal/gitaly/linguist/linguist_test.go index 79f0c3edb..8461b2e56 100644 --- a/internal/gitaly/linguist/linguist_test.go +++ b/internal/gitaly/linguist/linguist_test.go @@ -69,6 +69,90 @@ func testInstanceStats(t *testing.T, ctx context.Context) { }, }, { + desc: "documentation is ignored", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + docTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "readme.md", Mode: "100644", Content: strings.Repeat("a", 500)}, + {Path: "index.html", Mode: "100644", Content: strings.Repeat("a", 120)}, + {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 403)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "docs", Mode: "040000", OID: docTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + }, + }, + { + desc: "vendor is ignored", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + vendorTree := gittest.WriteTree(t, cfg, 
repoPath, []gittest.TreeEntry{ + {Path: "app.rb", Mode: "100644", Content: strings.Repeat("a", 500)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "vendor", Mode: "040000", OID: vendorTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + }, + }, + { + desc: "generated is ignored", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + podsTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "app.swift", Mode: "100644", Content: strings.Repeat("a", 500)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "Pods", Mode: "040000", OID: podsTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + }, + }, + { + desc: "undetectable languages are ignored", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "config.json", Mode: "100644", Content: strings.Repeat("a", 234)}, + gittest.TreeEntry{Path: "manual.md", Mode: "100644", Content: strings.Repeat("a", 553)}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + }, + }, + { desc: "empty code files", setup: func(t *testing.T) (*gitalypb.Repository, string, 
git.ObjectID) { repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ |