Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Toon Claes <toon@gitlab.com> 2022-09-20 20:58:55 +0300
committer Toon Claes <toon@gitlab.com> 2022-10-03 16:26:16 +0300
commit 2ae7de24d887eb5309420c151bdacdbfd8f880aa (patch)
tree 3d9c497d32887d5df67945db3055379427e09027
parent bad57b12f2234d3d1246dce658282d34a7b8f186 (diff)
linguist: Take .gitattributes into account
Users can override [1] the linguist behavior by setting attributes in gitattributes(5). This change copies that behavior into the Go implementation.

[1]: https://github.com/github/linguist/blob/master/docs/overrides.md
-rw-r--r-- go.mod 2
-rw-r--r-- internal/gitaly/linguist/file_instance.go 59
-rw-r--r-- internal/gitaly/linguist/linguist.go 41
-rw-r--r-- internal/gitaly/linguist/linguist_test.go 189
4 files changed, 286 insertions, 5 deletions
diff --git a/go.mod b/go.mod
index 11fb7c8f4..876f75f0b 100644
--- a/go.mod
+++ b/go.mod
@@ -14,6 +14,7 @@ require (
github.com/git-lfs/git-lfs/v3 v3.2.0
github.com/go-enry/go-enry/v2 v2.8.2
github.com/go-enry/go-license-detector/v4 v4.3.0
+ github.com/go-git/go-git/v5 v5.3.0
github.com/golang-jwt/jwt/v4 v4.4.2
github.com/google/go-cmp v0.5.9
github.com/google/uuid v1.3.0
@@ -106,7 +107,6 @@ require (
github.com/go-enry/go-oniguruma v1.2.1 // indirect
github.com/go-git/gcfg v1.5.0 // indirect
github.com/go-git/go-billy/v5 v5.1.0 // indirect
- github.com/go-git/go-git/v5 v5.3.0 // indirect
github.com/go-gorp/gorp/v3 v3.0.2 // indirect
github.com/go-ole/go-ole v1.2.4 // indirect
github.com/godbus/dbus/v5 v5.0.4 // indirect
diff --git a/internal/gitaly/linguist/file_instance.go b/internal/gitaly/linguist/file_instance.go
index 9b5484bf5..8403c28e4 100644
--- a/internal/gitaly/linguist/file_instance.go
+++ b/internal/gitaly/linguist/file_instance.go
@@ -3,38 +3,95 @@ package linguist
import (
"fmt"
"io"
+ "strings"
"github.com/go-enry/go-enry/v2"
+ "github.com/go-git/go-git/v5/plumbing/format/gitattributes"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/gitpipe"
)
+const (
+ linguistDocumentation = "linguist-documentation"
+ linguistDetectable = "linguist-detectable"
+ linguistGenerated = "linguist-generated"
+ linguistVendored = "linguist-vendored"
+ linguistLanguage = "linguist-language"
+)
+
type fileInstance struct {
filename string
+ attrs map[string]gitattributes.Attribute
}
-func newFileInstance(filename string) fileInstance {
+func newFileInstance(filename string, attrMatcher gitattributes.Matcher) fileInstance {
+ attrs, _ := attrMatcher.Match(strings.Split(filename, "/"),
+ []string{
+ linguistDocumentation,
+ linguistDetectable,
+ linguistGenerated,
+ linguistVendored,
+ linguistLanguage,
+ })
+
return fileInstance{
filename: filename,
+ attrs: attrs,
}
}
func (f fileInstance) isDocumentation() bool {
+ attr, ok := f.attrs[linguistDocumentation]
+ if ok {
+ if attr.IsUnset() {
+ return false
+ }
+ if attr.IsSet() {
+ return true
+ }
+ }
+
return enry.IsDocumentation(f.filename)
}
func (f fileInstance) isVendored() bool {
+ if attr, ok := f.attrs[linguistVendored]; ok {
+ if attr.IsUnset() {
+ return false
+ }
+ if attr.IsSet() {
+ return true
+ }
+ }
+
return enry.IsVendor(f.filename)
}
func (f fileInstance) isGenerated(content []byte) bool {
+ if attr, ok := f.attrs[linguistGenerated]; ok {
+ if attr.IsUnset() {
+ return false
+ }
+ if attr.IsSet() {
+ return true
+ }
+ }
+
return enry.IsGenerated(f.filename, content)
}
func (f fileInstance) getLanguage(content []byte) string {
+ if attr, ok := f.attrs[linguistLanguage]; ok {
+ return attr.Value()
+ }
+
return enry.GetLanguage(f.filename, content)
}
func (f fileInstance) isIgnoredLanguage(lang string) bool {
+ if attr, ok := f.attrs[linguistDetectable]; ok {
+ return !attr.IsSet()
+ }
+
// Ignore anything that's neither markup nor a programming language,
// similar to what the linguist gem does:
// https://github.com/github/linguist/blob/v7.20.0/lib/linguist/blob_helper.rb#L378-L387
diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go
index 6152797cc..95610c904 100644
--- a/internal/gitaly/linguist/linguist.go
+++ b/internal/gitaly/linguist/linguist.go
@@ -8,8 +8,10 @@ import (
"io"
"os"
"os/exec"
+ "strings"
"github.com/go-enry/go-enry/v2"
+ "github.com/go-git/go-git/v5/plumbing/format/gitattributes"
"github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
"gitlab.com/gitlab-org/gitaly/v15/internal/command"
"gitlab.com/gitlab-org/gitaly/v15/internal/git"
@@ -119,12 +121,17 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount
}
defer cancel()
+ attrMatcher, err := inst.newAttrMatcher(ctx, objectReader, commitID)
+ if err != nil {
+ return nil, fmt.Errorf("linguist new attribute matcher: %w", err)
+ }
+
var revlistIt gitpipe.RevisionIterator
if stats.CommitID == "" {
skipFunc := func(result *gitpipe.RevisionResult) bool {
// Skip files that are an excluded filetype based on filename.
- return newFileInstance(string(result.ObjectName)).IsExcluded()
+ return newFileInstance(string(result.ObjectName), attrMatcher).IsExcluded()
}
// No existing stats cached, so get all the files for the commit
@@ -143,7 +150,7 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount
// Skip files that are deleted, or
// an excluded filetype based on filename.
if git.ObjectHashSHA1.IsZeroOID(result.OID) ||
- newFileInstance(string(result.ObjectName)).IsExcluded() {
+ newFileInstance(string(result.ObjectName), attrMatcher).IsExcluded() {
// It's a little bit of a hack to use this skip
// function, but for every file that's deleted,
// remove the stats.
@@ -170,7 +177,7 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount
object := objectIt.Result()
filename := string(object.ObjectName)
- lang, size, err := newFileInstance(filename).DetermineStats(object)
+ lang, size, err := newFileInstance(filename, attrMatcher).DetermineStats(object)
if err != nil {
return nil, fmt.Errorf("linguist determine stats: %w", err)
}
@@ -199,3 +206,31 @@ func (inst *Instance) enryStats(ctx context.Context, commitID string) (ByteCount
return stats.Totals, nil
}
+
+func (inst *Instance) newAttrMatcher(ctx context.Context, objectReader catfile.ObjectReader, commitID string) (gitattributes.Matcher, error) {
+ var gitattrObject io.Reader
+ var err error
+
+ gitattrObject, err = objectReader.Object(ctx, git.Revision(commitID+":.gitattributes"))
+ if catfile.IsNotFound(err) {
+ gitattrObject = strings.NewReader("")
+ } else if err != nil {
+ return nil, fmt.Errorf("read .gitattributes: %w", err)
+ }
+
+ attrs, err := gitattributes.ReadAttributes(gitattrObject, nil, true)
+ if err != nil {
+ return nil, fmt.Errorf("read attr: %w", err)
+ }
+
+ // Reverse the slice because of a bug in go-git, see
+ // https://github.com/go-git/go-git/pull/585
+ attrsLen := len(attrs)
+ attrsMid := attrsLen / 2
+ for i := 0; i < attrsMid; i++ {
+ j := attrsLen - i - 1
+ attrs[i], attrs[j] = attrs[j], attrs[i]
+ }
+
+ return gitattributes.NewMatcher(attrs), nil
+}
diff --git a/internal/gitaly/linguist/linguist_test.go b/internal/gitaly/linguist/linguist_test.go
index 8461b2e56..c03f96fba 100644
--- a/internal/gitaly/linguist/linguist_test.go
+++ b/internal/gitaly/linguist/linguist_test.go
@@ -92,6 +92,31 @@ func testInstanceStats(t *testing.T, ctx context.Context) {
},
},
{
+ desc: "documentation with overrides",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ docTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "readme.md", Mode: "100644", Content: strings.Repeat("a", 500)},
+ {Path: "index.html", Mode: "100644", Content: strings.Repeat("a", 120)},
+ {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 403)},
+ })
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "docs", Mode: "040000", OID: docTree},
+ gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)},
+ gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "formatter.rb -linguist-documentation"},
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "C": 85,
+ "Ruby": 403,
+ },
+ },
+ {
desc: "vendor is ignored",
setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
@@ -113,6 +138,29 @@ func testInstanceStats(t *testing.T, ctx context.Context) {
},
},
{
+ desc: "vendor with overrides",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ vendorTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "app.rb", Mode: "100644", Content: strings.Repeat("a", 500)},
+ })
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "vendor", Mode: "040000", OID: vendorTree},
+ gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)},
+ gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "*.rb -linguist-vendored"},
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "C": 85,
+ "Ruby": 500,
+ },
+ },
+ {
desc: "generated is ignored",
setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
@@ -134,6 +182,29 @@ func testInstanceStats(t *testing.T, ctx context.Context) {
},
},
{
+ desc: "generated with overrides",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ podsTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "app.swift", Mode: "100644", Content: strings.Repeat("a", 500)},
+ })
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "Pods", Mode: "040000", OID: podsTree},
+ gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)},
+ gittest.TreeEntry{Path: ".gitattributes", Mode: "100644", Content: "Pods/* -linguist-generated"},
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "C": 85,
+ "Swift": 500,
+ },
+ },
+ {
desc: "undetectable languages are ignored",
setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
@@ -153,6 +224,124 @@ func testInstanceStats(t *testing.T, ctx context.Context) {
},
},
{
+ desc: "undetectable languages with overrides",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "config.json", Mode: "100644", Content: strings.Repeat("a", 234)},
+ gittest.TreeEntry{Path: "manual.md", Mode: "100644", Content: strings.Repeat("a", 553)},
+ gittest.TreeEntry{Path: "main.c", Mode: "100644", Content: strings.Repeat("a", 85)},
+ gittest.TreeEntry{
+ Path: ".gitattributes",
+ Mode: "100644",
+ Content: "*.md linguist-detectable\n" +
+ "*.json linguist-detectable\n",
+ },
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "C": 85,
+ "JSON": 234,
+ "Markdown": 553,
+ },
+ },
+ {
+ desc: "file specific documentation override",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ docTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "readme.md", Mode: "100644", Content: strings.Repeat("a", 500)},
+ {Path: "index.html", Mode: "100644", Content: strings.Repeat("a", 120)},
+ {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 403)},
+ })
+
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "docu", Mode: "040000", OID: docTree},
+ gittest.TreeEntry{
+ Path: ".gitattributes",
+ Mode: "100644",
+ Content: "docu/* linguist-documentation\n" +
+ "docu/formatter.rb -linguist-documentation",
+ },
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "Ruby": 403,
+ },
+ },
+ {
+ desc: "detectable overrides",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "keeb.kicad_pcb", Mode: "100644", Content: strings.Repeat("a", 500)},
+ gittest.TreeEntry{Path: "keeb.sch", Mode: "100644", Content: strings.Repeat("a", 120)},
+ gittest.TreeEntry{Path: "export_bom.py", Mode: "100644", Content: strings.Repeat("a", 403)},
+ gittest.TreeEntry{
+ Path: ".gitattributes",
+ Mode: "100644",
+ Content: "*.kicad_pcb linguist-detectable\n" +
+ "*.sch linguist-detectable\n" +
+ "export_bom.py -linguist-detectable",
+ },
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "KiCad Layout": 500,
+ "XML": 120,
+ },
+ },
+ {
+ desc: "double star file pattern documentation override",
+ setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
+ repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{
+ SkipCreationViaService: true,
+ })
+
+ subSubTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "first.rb", Mode: "100644", Content: strings.Repeat("a", 483)},
+ {Path: "second.rb", Mode: "100644", Content: strings.Repeat("a", 888)},
+ })
+
+ subTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{
+ {Path: "main.rb", Mode: "100644", Content: strings.Repeat("a", 500)},
+ {Path: "formatter.rb", Mode: "100644", Content: strings.Repeat("a", 120)},
+ {Path: "example", Mode: "040000", OID: subSubTree},
+ })
+
+ commitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTreeEntries(
+ gittest.TreeEntry{Path: "scripts", Mode: "040000", OID: subTree},
+ gittest.TreeEntry{Path: "run.rb", Mode: "100644", Content: strings.Repeat("a", 55)},
+ gittest.TreeEntry{
+ Path: ".gitattributes",
+ Mode: "100644",
+ Content: "scripts/** linguist-documentation\n" +
+ "scripts/formatter.rb -linguist-documentation",
+ },
+ ))
+
+ return repoProto, repoPath, commitID
+ },
+ expectedStats: ByteCountPerLanguage{
+ "Ruby": 175,
+ },
+ },
+ {
desc: "empty code files",
setup: func(t *testing.T) (*gitalypb.Repository, string, git.ObjectID) {
repoProto, repoPath := gittest.CreateRepository(ctx, t, cfg, gittest.CreateRepositoryConfig{