Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorToon Claes <toon@gitlab.com>2023-02-01 17:31:17 +0300
committerToon Claes <toon@gitlab.com>2023-02-01 18:55:28 +0300
commit23743a961c02c53a22584e4d5baef3cb273d19e7 (patch)
tree220a4a7992779c9839d0c9c643ddbd4e01d9b3e7
parentf6cedf746781d9cc0147ff9f90fb3a6703bdf81b (diff)
Draft: Rewrite using google/licenseclassifiertoon-google-license
Issue: https://gitlab.com/gitlab-org/gitaly/-/issues/4761
-rw-r--r--internal/gitaly/service/repository/license.go178
1 files changed, 57 insertions, 121 deletions
diff --git a/internal/gitaly/service/repository/license.go b/internal/gitaly/service/repository/license.go
index 3f3e638fd..0fa3914b1 100644
--- a/internal/gitaly/service/repository/license.go
+++ b/internal/gitaly/service/repository/license.go
@@ -6,13 +6,10 @@ import (
"errors"
"fmt"
"io"
- "regexp"
"sort"
- "strings"
- "github.com/go-enry/go-license-detector/v4/licensedb"
- "github.com/go-enry/go-license-detector/v4/licensedb/api"
- "github.com/go-enry/go-license-detector/v4/licensedb/filer"
+ classifier "github.com/google/licenseclassifier/v2"
+ classifierassets "github.com/google/licenseclassifier/v2/assets"
log "github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/v15/internal/git"
"gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo"
@@ -42,11 +39,16 @@ var nicknameByLicenseIdentifier = map[string]string{
"gpl-2.0": "GNU GPLv2",
}
-// PreloadLicenseDatabase will warm up the license database, otherwise
-// the first call to `licensedb.Detect` could take too long:
-// https://github.com/go-enry/go-license-detector/issues/13
+var globalClassifier *classifier.Classifier
+
+// PreloadLicenseDatabase will warm up the license database.
func PreloadLicenseDatabase() error {
- licensedb.Preload()
+ var err error
+ globalClassifier, err = classifierassets.DefaultClassifier()
+ if err != nil {
+ return err
+ }
+
log.Info("License database preloaded")
return nil
@@ -97,132 +99,76 @@ func (s *server) FindLicense(ctx context.Context, req *gitalypb.FindLicenseReque
func findLicense(ctx context.Context, repo *localrepo.Repo, commitID git.ObjectID) (*gitalypb.FindLicenseResponse, error) {
repoFiler := &gitFiler{ctx: ctx, repo: repo, treeishID: commitID}
- detectedLicenses, err := licensedb.Detect(repoFiler)
+
+ files, err := repoFiler.ReadDir("")
if err != nil {
- if errors.Is(err, licensedb.ErrNoLicenseFound) {
- if repoFiler.foundLicense {
- // In case the license is not identified, but a file containing some
- // sort of license is found, we return a predefined response.
- return &gitalypb.FindLicenseResponse{
- LicenseName: "Other",
- LicenseShortName: "other",
- LicenseNickname: "LICENSE", // Show as LICENSE in the UI
- LicensePath: repoFiler.path,
- }, nil
- }
- return &gitalypb.FindLicenseResponse{}, nil
- }
- return nil, structerr.NewInternal("detect licenses: %w", err)
+ return nil, fmt.Errorf("read root dir: %w", err)
}
- type bestMatch struct {
- shortName string
- api.Match
+ objectReader, cancel, err := repo.ObjectReader(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("create object reader: %w", err)
}
- bestMatches := make([]bestMatch, 0, len(detectedLicenses))
- for candidate, match := range detectedLicenses {
- _, err := licensedb.LicenseName(trimDeprecatedPrefix(candidate))
+ defer cancel()
+
+ var matches classifier.Matches
+ for _, f := range files {
+ rev := git.Revision(fmt.Sprintf("%s:%s", commitID, f))
+ // oid := git.ObjectID(rev.String())
+
+ // data, err := repo.ReadObject(ctx, oid)
+ // if err != nil {
+ // return nil, fmt.Errorf("read object %v: %w", rev.String(), err)
+ // }
+ // results := globalClassifier.Match(data)
+
+ object, err := objectReader.Object(ctx, rev)
if err != nil {
- if errors.Is(err, licensedb.ErrUnknownLicenseID) {
- continue
- }
- return nil, structerr.NewInternal("license name by id %q: %w", candidate, err)
+ return nil, fmt.Errorf("read file %v: %w", f, err)
}
- bestMatches = append(bestMatches, bestMatch{Match: match, shortName: candidate})
- }
-
- if len(bestMatches) == 0 {
- return &gitalypb.FindLicenseResponse{}, nil
+ results, err := globalClassifier.MatchFrom(object)
+ if err != nil {
+ return nil, fmt.Errorf("classify %v: %w", f, err)
+ }
+ matches = append(matches, results.Matches...)
}
-
- sort.Slice(bestMatches, func(i, j int) bool {
+ sort.Slice(matches, func(i, j int) bool {
// Because there could be multiple matches with the same confidence, we need
// to make sure the function is consistent and returns the same license on
// each invocation. That is why we sort by the short name as well.
- if bestMatches[i].Confidence == bestMatches[j].Confidence {
- return trimDeprecatedPrefix(bestMatches[i].shortName) < trimDeprecatedPrefix(bestMatches[j].shortName)
+ if matches[i].Confidence == matches[j].Confidence {
+ return matches[i].Name < matches[j].Name
}
- return bestMatches[i].Confidence > bestMatches[j].Confidence
+ return matches[i].Confidence > matches[j].Confidence
})
-
- // We also don't want to return the prefix back to the caller if it exists.
- shortName := trimDeprecatedPrefix(bestMatches[0].shortName)
-
- name, err := licensedb.LicenseName(shortName)
- if err != nil {
- return nil, structerr.NewInternal("license name by id %q: %w", shortName, err)
+ if len(matches) == 0 {
+ return &gitalypb.FindLicenseResponse{}, nil
}
- urls, err := licensedb.LicenseURLs(shortName)
- if err != nil {
- return nil, structerr.NewInternal("license URLs by id %q: %w", shortName, err)
- }
- var url string
- if len(urls) > 0 {
- // The URL list is returned in an ordered slice, so we just pick up the first one from the list.
- url = urls[0]
- }
+ bestMatch := matches[0]
- // The license identifier used by `github.com/go-enry/go-license-detector` is
- // case-sensitive, but the API requires all license identifiers to be lower-cased.
- shortName = strings.ToLower(shortName)
- nickname := nicknameByLicenseIdentifier[shortName]
+ // TODO https://github.com/spdx/license-list-data/tree/main/json
return &gitalypb.FindLicenseResponse{
- LicenseShortName: shortName,
- LicensePath: bestMatches[0].File,
- LicenseName: name,
- LicenseUrl: url,
- LicenseNickname: nickname,
+ LicenseShortName: bestMatch.Name,
+ LicensePath: bestMatch.Name,
+ LicenseName: bestMatch.Name,
+ LicenseUrl: bestMatch.Name,
+ LicenseNickname: bestMatch.Name,
}, nil
}
-// For the deprecated licenses, the `github.com/go-enry/go-license-detector` package
-// uses the "deprecated_" prefix in the identifier. But the license database stores
-// information using the identifier without prefix, so we need to cut off the
-// prefix before searching for full license name and license URLs.
-func trimDeprecatedPrefix(name string) string {
- return strings.TrimPrefix(name, "deprecated_")
-}
-
-var readmeRegexp = regexp.MustCompile(`(readme|guidelines)(\.md|\.rst|\.html|\.txt)?$`)
-
type gitFiler struct {
- ctx context.Context
- repo *localrepo.Repo
- foundLicense bool
- path string
- treeishID git.ObjectID
+ ctx context.Context
+ repo *localrepo.Repo
+ treeishID git.ObjectID
}
-func (f *gitFiler) ReadFile(path string) ([]byte, error) {
- data, err := f.repo.ReadObject(f.ctx, git.ObjectID(fmt.Sprintf("%s:%s", f.treeishID, path)))
- if err != nil {
- return nil, fmt.Errorf("read file: %w", err)
- }
-
- // `licensedb.Detect` only opens files that look like licenses. Failing that, it will
- // also open readme files to try to identify license files. The RPC handler needs the
- // knowledge of whether any license files were encountered, so we filter out the
- // readme files as defined in licensedb.Detect:
- // https://github.com/go-enry/go-license-detector/blob/4f2ca6af2ab943d9b5fa3a02782eebc06f79a5f4/licensedb/internal/investigation.go#L61
- //
- // This doesn't filter out the possible license files identified from the readme files which may in fact not
- // be licenses.
- if !f.foundLicense {
- f.foundLicense = !readmeRegexp.MatchString(strings.ToLower(path))
- if f.foundLicense {
- f.path = path
- }
- }
-
- return data, nil
-}
-
-func (f *gitFiler) ReadDir(string) ([]filer.File, error) {
+func (f *gitFiler) ReadDir(string) ([]string, error) {
var stderr bytes.Buffer
cmd, err := f.repo.Exec(f.ctx, git.Command{
Name: "ls-tree",
Flags: []git.Option{
+ //git.Flag{Name: "--full-tree"},
git.Flag{Name: "-z"},
},
Args: []string{f.treeishID.String()},
@@ -233,7 +179,7 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) {
tree := lstree.NewParser(cmd, git.ObjectHashSHA1)
- var files []filer.File
+ var files []string
for {
entry, err := tree.NextEntry()
if err != nil {
@@ -247,10 +193,7 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) {
continue
}
- files = append(files, filer.File{
- Name: entry.Path,
- IsDir: false,
- })
+ files = append(files, entry.Path)
}
if err := cmd.Wait(); err != nil {
@@ -259,10 +202,3 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) {
return files, nil
}
-
-func (f *gitFiler) Close() {}
-
-func (f *gitFiler) PathsAreAlwaysSlash() bool {
- // git ls-files uses unix slash `/`
- return true
-}