diff options
author | Toon Claes <toon@gitlab.com> | 2022-09-20 20:58:55 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2022-10-03 16:26:16 +0300 |
commit | 2ae7de24d887eb5309420c151bdacdbfd8f880aa (patch) | |
tree | 3d9c497d32887d5df67945db3055379427e09027 | |
parent | bad57b12f2234d3d1246dce658282d34a7b8f186 (diff) |
linguist: Take .gitattributes into account
Users can override [1] linguist's behavior by setting linguist-*
attributes in a .gitattributes file (see gitattributes(5)). This change
implements the same overrides in the Go implementation.
[1]: https://github.com/github/linguist/blob/master/docs/overrides.md
-rw-r--r-- | go.mod | 2 | ||||
-rw-r--r-- | internal/gitaly/linguist/file_instance.go | 59 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist.go | 41 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist_test.go | 189 |
4 files changed, 286 insertions, 5 deletions
@@ -14,6 +14,7 @@ require ( github.com/git-lfs/git-lfs/v3 v3.2.0 github.com/go-enry/go-enry/v2 v2.8.2 github.com/go-enry/go-license-detector/v4 v4.3.0 + github.com/go-git/go-git/v5 v5.3.0 github.com/golang-jwt/jwt/v4 v4.4.2 github.com/google/go-cmp v0.5.9 github.com/google/uuid v1.3.0 @@ -106,7 +107,6 @@ require ( github.com/go-enry/go-oniguruma v1.2.1 // indirect github.com/go-git/gcfg v1.5.0 // indirect github.com/go-git/go-billy/v5 v5.1.0 // indirect - github.com/go-git/go-git/v5 v5.3.0 // indirect github.com/go-gorp/gorp/v3 v3.0.2 // indirect github.com/go-ole/go-ole v1.2.4 // indirect github.com/godbus/dbus/v5 v5.0.4 // indirect diff --git a/internal/gitaly/linguist/file_instance.go b/internal/gitaly/linguist/file_instance.go index 9b5484bf5..8403c28e4 100644 --- a/internal/gitaly/linguist/file_instance.go +++ b/internal/gitaly/linguist/file_instance.go @@ -3,38 +3,95 @@ package linguist import ( "fmt" "io" + "strings" "github.com/go-enry/go-enry/v2" + "github.com/go-git/go-git/v5/plumbing/format/gitattributes" "gitlab.com/gitlab-org/gitaly/v15/internal/git/gitpipe" ) +const ( + linguistDocumentation = "linguist-documentation" + linguistDetectable = "linguist-detectable" + linguistGenerated = "linguist-generated" + linguistVendored = "linguist-vendored" + linguistLanguage = "linguist-language" +) + type fileInstance struct { filename string + attrs map[string]gitattributes.Attribute } -func newFileInstance(filename string) fileInstance { +func newFileInstance(filename string, attrMatcher gitattributes.Matcher) fileInstance { + attrs, _ := attrMatcher.Match(strings.Split(filename, "/"), + []string{ + linguistDocumentation, + linguistDetectable, + linguistGenerated, + linguistVendored, + linguistLanguage, + }) + return fileInstance{ filename: filename, + attrs: attrs, } } func (f fileInstance) isDocumentation() bool { + attr, ok := f.attrs[linguistDocumentation] + if ok { + if attr.IsUnset() { + return false + } + if attr.IsSet() { + return true + } + } + return 
enry.IsDocumentation(f.filename) } func (f fileInstance) isVendored() bool { + if attr, ok := f.attrs[linguistVendored]; ok { + if attr.IsUnset() { + return false + } + if attr.IsSet() { + return true + } + } + return enry.IsVendor(f.filename) } func (f fileInstance) isGenerated(content []byte) bool { + if attr, ok := f.attrs[linguistGenerated]; ok { + if attr.IsUnset() { + return false + } + if attr.IsSet() { + return true + } + } + return enry.IsGenerated(f.filename, content) } func (f fileInstance) getLanguage(content []byte) string { + if attr, ok := f.attrs[linguistLanguage]; ok { + return attr.Value() + } + return enry.GetLanguage(f.filename, content) } func (f fileInstance) isIgnoredLanguage(lang string) bool { + if attr, ok := f.attrs[linguistDetectable]; ok { + return !attr.IsSet() + } + // Ignore anything that's neither markup nor a programming language, // similar to what the linguist gem does: // https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387 diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go index 6152797cc..95610c904 100644 --- a/internal/gitaly/linguist/linguist.go +++ b/internal/gitaly/linguist/linguist.go @@ -8,8 +8,10 @@ import ( "io" "os" "os/exec" + "strings" "github.com/go-enry/go-enry/v2" + "github.com/go-git/go-git/v5/plumbing/format/gitattributes" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "gitlab.com/gitlab-org/gitaly/v15/internal/command" "gitlab.com/gitlab-org/gitaly/v15/internal/git" @@ -119,12 +121,17 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount } defer cancel() + attrMatcher, err := inst.newAttrMatcher(ctx, objectReader, commitID) + if err != nil { + return nil, fmt.Errorf("linguist new attribute matcher: %w", err) + } + var revlistIt gitpipe.RevisionIterator if stats.CommitID == "" { skipFunc := func(result *gitpipe.RevisionResult) bool { // Skip files that are an excluded filetype based on 
filename. - return newFileInstance(string(result.ObjectName)).IsExcluded() + return newFileInstance(string(result.ObjectName), attrMatcher).IsExcluded() } // No existing stats cached, so get all the files for the commit @@ -143,7 +150,7 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount // Skip files that are deleted, or // an excluded filetype based on filename. if git.ObjectHashSHA1.IsZeroOID(result.OID) || - newFileInstance(string(result.ObjectName)).IsExcluded() { + newFileInstance(string(result.ObjectName), attrMatcher).IsExcluded() { // It's a little bit of a hack to use this skip // function, but for every file that's deleted, // remove the stats. @@ -170,7 +177,7 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount object := objectIt.Result() filename := string(object.ObjectName) - lang, size, err := newFileInstance(filename).DetermineStats(object) + lang, size, err := newFileInstance(filename, attrMatcher).DetermineStats(object) if err != nil { return nil, fmt.Errorf("linguist determine stats: %w", err) } @@ -199,3 +206,31 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount return stats.Totals, nil } + +func (inst *Instance) newAttrMatcher(ctx context.Context, objectReader catfile.ObjectReader, commitID string) (gitattributes.Matcher, error) { + var gitattrObject io.Reader + var err error + + gitattrObject, err = objectReader.Object(ctx, git.Revision(commitID+":.gitattributes")) + if catfile.IsNotFound(err) { + gitattrObject = strings.NewReader("") + } else if err != nil { + return nil, fmt.Errorf("read .gitattributes: %w", err) + } + + attrs, err := gitattributes.ReadAttributes(gitattrObject, nil, true) + if err != nil { + return nil, fmt.Errorf("read attr: %w", err) + } + + // Reverse the slice because of a bug in go-git, see + // https://github.com/go-git/go-git/pull/585 + attrsLen := len(attrs) + attrsMid := attrsLen / 2 + for i := 0; i < attrsMid; i++ { + j 
:= attrsLen - i - 1 + attrs[i], attrs[j] = attrs[j], attrs[i] + } + + return gitattributes.NewMatcher(attrs), nil +} diff --git a/internal/gitaly/linguist/linguist_test.go b/internal/gitaly/linguist/linguist_test.go index 8461b2e56..c03f96fba 100644 --- a/internal/gitaly/linguist/linguist_test.go +++ b/internal/gitaly/linguist/linguist_test.go @@ -92,6 +92,31 @@ func testInstanceStats(t *testing.T, ctx context.Context) { }, }, { + desc: "documentation with overrides", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + docTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "readme.md", Mode: "100644", Content: strings.Repeat("a", 500)}, + {Path: "index.html", Mode: "100644", Content: strings.Repeat("a", 120)}, + {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 403)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "docs", Mode: "040000", OID: docTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "formatter.rb -linguist-documentation"}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + "Ruby": 403, + }, + }, + { desc: "vendor is ignored", setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ @@ -113,6 +138,29 @@ func testInstanceStats(t *testing.T, ctx context.Context) { }, }, { + desc: "vendor with overrides", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + vendorTree := 
gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "app.rb", Mode: "100644", Content: strings.Repeat("a", 500)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "vendor", Mode: "040000", OID: vendorTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "*.rb -linguist-vendored"}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + "Ruby": 500, + }, + }, + { desc: "generated is ignored", setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ @@ -134,6 +182,29 @@ func testInstanceStats(t *testing.T, ctx context.Context) { }, }, { + desc: "generated with overrides", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + podsTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "app.swift", Mode: "100644", Content: strings.Repeat("a", 500)}, + }) + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "Pods", Mode: "040000", OID: podsTree}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "Pods/* -linguist-generated"}, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + "Swift": 500, + }, + }, + { desc: "undetectable languages are ignored", setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ @@ -153,6 +224,124 @@ func testInstanceStats(t 
*testing.T, ctx context.Context) { }, }, { + desc: "undetectable languages with overrides", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "config.json", Mode: "100644", Content: strings.Repeat("a", 234)}, + gittest.TreeEntry{Path: "manual.md", Mode: "100644", Content: strings.Repeat("a", 553)}, + gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)}, + gittest.TreeEntry{ + Path: ".gitattributes", + Mode: "100644", + Content: "*.md linguist-detectable\n" + + "*.json linguist-detectable\n", + }, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "C": 85, + "JSON": 234, + "Markdown": 553, + }, + }, + { + desc: "file specific documentation override", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + docTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "readme.md", Mode: "100644", Content: strings.Repeat("a", 500)}, + {Path: "index.html", Mode: "100644", Content: strings.Repeat("a", 120)}, + {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 403)}, + }) + + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "docu", Mode: "040000", OID: docTree}, + gittest.TreeEntry{ + Path: ".gitattributes", + Mode: "100644", + Content: "docu/* linguist-documentation\n" + + "docu/formatter.rb -linguist-documentation", + }, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "Ruby": 403, + }, + }, + { + desc: "detectable overrides", + setup: func(t *testing.T) 
(*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "keeb.kicad_pcb", Mode: "100644", Content: strings.Repeat("a", 500)}, + gittest.TreeEntry{Path: "keeb.sch", Mode: "100644", Content: strings.Repeat("a", 120)}, + gittest.TreeEntry{Path: "export_bom.py", Mode: "100644", Content: strings.Repeat("a", 403)}, + gittest.TreeEntry{ + Path: ".gitattributes", + Mode: "100644", + Content: "*.kicad_pcb linguist-detectable\n" + + "*.sch linguist-detectable\n" + + "export_bom.py -linguist-detectable", + }, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "KiCad Layout": 500, + "XML": 120, + }, + }, + { + desc: "double star file pattern documentation override", + setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { + repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + subSubTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "first.rb", Mode: "100644", Content: strings.Repeat("a", 483)}, + {Path: "second.rb", Mode: "100644", Content: strings.Repeat("a", 888)}, + }) + + subTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + {Path: "main.rb", Mode: "100644", Content: strings.Repeat("a", 500)}, + {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 120)}, + {Path: "example", Mode: "040000", OID: subSubTree}, + }) + + commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries( + gittest.TreeEntry{Path: "scripts", Mode: "040000", OID: subTree}, + gittest.TreeEntry{Path: "run.rb", Mode: "100644", Content: strings.Repeat("a", 55)}, + gittest.TreeEntry{ + Path: ".gitattributes", + Mode: "100644", + Content: "scripts/** linguist-documentation\n" + + 
"scripts/formatter.rb -linguist-documentation", + }, + )) + + return repoProto, repoPath, commitID + }, + expectedStats: ByteCountPerLanguage{ + "Ruby": 175, + }, + }, + { desc: "empty code files", setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) { repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{ |