diff options
author | Toon Claes <toon@gitlab.com> | 2023-02-01 17:31:17 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2023-02-01 18:55:28 +0300 |
commit | 23743a961c02c53a22584e4d5baef3cb273d19e7 (patch) | |
tree | 220a4a7992779c9839d0c9c643ddbd4e01d9b3e7 | |
parent | f6cedf746781d9cc0147ff9f90fb3a6703bdf81b (diff) |
Draft: Rewrite using google/licenseclassifiertoon-google-license
Issue: https://gitlab.com/gitlab-org/gitaly/-/issues/4761
-rw-r--r-- | internal/gitaly/service/repository/license.go | 178 |
1 files changed, 57 insertions, 121 deletions
diff --git a/internal/gitaly/service/repository/license.go b/internal/gitaly/service/repository/license.go index 3f3e638fd..0fa3914b1 100644 --- a/internal/gitaly/service/repository/license.go +++ b/internal/gitaly/service/repository/license.go @@ -6,13 +6,10 @@ import ( "errors" "fmt" "io" - "regexp" "sort" - "strings" - "github.com/go-enry/go-license-detector/v4/licensedb" - "github.com/go-enry/go-license-detector/v4/licensedb/api" - "github.com/go-enry/go-license-detector/v4/licensedb/filer" + classifier "github.com/google/licenseclassifier/v2" + classifierassets "github.com/google/licenseclassifier/v2/assets" log "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/v15/internal/git" "gitlab.com/gitlab-org/gitaly/v15/internal/git/localrepo" @@ -42,11 +39,16 @@ var nicknameByLicenseIdentifier = map[string]string{ "gpl-2.0": "GNU GPLv2", } -// PreloadLicenseDatabase will warm up the license database, otherwise -// the first call to `licensedb.Detect` could take too long: -// https://github.com/go-enry/go-license-detector/issues/13 +var globalClassifier *classifier.Classifier + +// PreloadLicenseDatabase will warm up the license database. func PreloadLicenseDatabase() error { - licensedb.Preload() + var err error + globalClassifier, err = classifierassets.DefaultClassifier() + if err != nil { + return err + } + log.Info("License database preloaded") return nil @@ -97,132 +99,76 @@ func (s *server) FindLicense(ctx context.Context, req *gitalypb.FindLicenseReque func findLicense(ctx context.Context, repo *localrepo.Repo, commitID git.ObjectID) (*gitalypb.FindLicenseResponse, error) { repoFiler := &gitFiler{ctx: ctx, repo: repo, treeishID: commitID} - detectedLicenses, err := licensedb.Detect(repoFiler) + + files, err := repoFiler.ReadDir("") if err != nil { - if errors.Is(err, licensedb.ErrNoLicenseFound) { - if repoFiler.foundLicense { - // In case the license is not identified, but a file containing some - // sort of license is found, we return a predefined response. - return &gitalypb.FindLicenseResponse{ - LicenseName: "Other", - LicenseShortName: "other", - LicenseNickname: "LICENSE", // Show as LICENSE in the UI - LicensePath: repoFiler.path, - }, nil - } - return &gitalypb.FindLicenseResponse{}, nil - } - return nil, structerr.NewInternal("detect licenses: %w", err) + return nil, fmt.Errorf("read root dir: %w", err) } - type bestMatch struct { - shortName string - api.Match + objectReader, cancel, err := repo.ObjectReader(ctx) + if err != nil { + return nil, fmt.Errorf("create object reader: %w", err) } - bestMatches := make([]bestMatch, 0, len(detectedLicenses)) - for candidate, match := range detectedLicenses { - _, err := licensedb.LicenseName(trimDeprecatedPrefix(candidate)) + defer cancel() + + var matches classifier.Matches + for _, f := range files { + rev := git.Revision(fmt.Sprintf("%s:%s", commitID, f)) + // oid := git.ObjectID(rev.String()) + + // data, err := repo.ReadObject(ctx, oid) + // if err != nil { + // return nil, fmt.Errorf("read object %v: %w", rev.String(), err) + // } + // results := globalClassifier.Match(data) + + object, err := objectReader.Object(ctx, rev) if err != nil { - if errors.Is(err, licensedb.ErrUnknownLicenseID) { - continue - } - return nil, structerr.NewInternal("license name by id %q: %w", candidate, err) + return nil, fmt.Errorf("read file %v: %w", f, err) } - bestMatches = append(bestMatches, bestMatch{Match: match, shortName: candidate}) - } - - if len(bestMatches) == 0 { - return &gitalypb.FindLicenseResponse{}, nil + results, err := globalClassifier.MatchFrom(object) + if err != nil { + return nil, fmt.Errorf("classify %v: %w", f, err) + } + matches = append(matches, results.Matches...) } - - sort.Slice(bestMatches, func(i, j int) bool { + sort.Slice(matches, func(i, j int) bool { // Because there could be multiple matches with the same confidence, we need // to make sure the function is consistent and returns the same license on // each invocation. That is why we sort by the short name as well. - if bestMatches[i].Confidence == bestMatches[j].Confidence { - return trimDeprecatedPrefix(bestMatches[i].shortName) < trimDeprecatedPrefix(bestMatches[j].shortName) + if matches[i].Confidence == matches[j].Confidence { + return matches[i].Name < matches[j].Name } - return bestMatches[i].Confidence > bestMatches[j].Confidence + return matches[i].Confidence > matches[j].Confidence }) - - // We also don't want to return the prefix back to the caller if it exists. - shortName := trimDeprecatedPrefix(bestMatches[0].shortName) - - name, err := licensedb.LicenseName(shortName) - if err != nil { - return nil, structerr.NewInternal("license name by id %q: %w", shortName, err) + if len(matches) == 0 { + return &gitalypb.FindLicenseResponse{}, nil } - urls, err := licensedb.LicenseURLs(shortName) - if err != nil { - return nil, structerr.NewInternal("license URLs by id %q: %w", shortName, err) - } - var url string - if len(urls) > 0 { - // The URL list is returned in an ordered slice, so we just pick up the first one from the list. - url = urls[0] - } + bestMatch := matches[0] - // The license identifier used by `github.com/go-enry/go-license-detector` is - // case-sensitive, but the API requires all license identifiers to be lower-cased. - shortName = strings.ToLower(shortName) - nickname := nicknameByLicenseIdentifier[shortName] + // TODO https://github.com/spdx/license-list-data/tree/main/json return &gitalypb.FindLicenseResponse{ - LicenseShortName: shortName, - LicensePath: bestMatches[0].File, - LicenseName: name, - LicenseUrl: url, - LicenseNickname: nickname, + LicenseShortName: bestMatch.Name, + LicensePath: bestMatch.Name, + LicenseName: bestMatch.Name, + LicenseUrl: bestMatch.Name, + LicenseNickname: bestMatch.Name, }, nil } -// For the deprecated licenses, the `github.com/go-enry/go-license-detector` package -// uses the "deprecated_" prefix in the identifier. But the license database stores -// information using the identifier without prefix, so we need to cut off the -// prefix before searching for full license name and license URLs. -func trimDeprecatedPrefix(name string) string { - return strings.TrimPrefix(name, "deprecated_") -} - -var readmeRegexp = regexp.MustCompile(`(readme|guidelines)(\.md|\.rst|\.html|\.txt)?$`) - type gitFiler struct { - ctx context.Context - repo *localrepo.Repo - foundLicense bool - path string - treeishID git.ObjectID + ctx context.Context + repo *localrepo.Repo + treeishID git.ObjectID } -func (f *gitFiler) ReadFile(path string) ([]byte, error) { - data, err := f.repo.ReadObject(f.ctx, git.ObjectID(fmt.Sprintf("%s:%s", f.treeishID, path))) - if err != nil { - return nil, fmt.Errorf("read file: %w", err) - } - - // `licensedb.Detect` only opens files that look like licenses. Failing that, it will - // also open readme files to try to identify license files. The RPC handler needs the - // knowledge of whether any license files were encountered, so we filter out the - // readme files as defined in licensedb.Detect: - // https://github.com/go-enry/go-license-detector/blob/4f2ca6af2ab943d9b5fa3a02782eebc06f79a5f4/licensedb/internal/investigation.go#L61 - // - // This doesn't filter out the possible license files identified from the readme files which may in fact not - // be licenses. - if !f.foundLicense { - f.foundLicense = !readmeRegexp.MatchString(strings.ToLower(path)) - if f.foundLicense { - f.path = path - } - } - - return data, nil -} - -func (f *gitFiler) ReadDir(string) ([]filer.File, error) { +func (f *gitFiler) ReadDir(string) ([]string, error) { var stderr bytes.Buffer cmd, err := f.repo.Exec(f.ctx, git.Command{ Name: "ls-tree", Flags: []git.Option{ + //git.Flag{Name: "--full-tree"}, git.Flag{Name: "-z"}, }, Args: []string{f.treeishID.String()}, @@ -233,7 +179,7 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) { tree := lstree.NewParser(cmd, git.ObjectHashSHA1) - var files []filer.File + var files []string for { entry, err := tree.NextEntry() if err != nil { @@ -247,10 +193,7 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) { continue } - files = append(files, filer.File{ - Name: entry.Path, - IsDir: false, - }) + files = append(files, entry.Path) } if err := cmd.Wait(); err != nil { @@ -259,10 +202,3 @@ func (f *gitFiler) ReadDir(string) ([]filer.File, error) { return files, nil } - -func (f *gitFiler) Close() {} - -func (f *gitFiler) PathsAreAlwaysSlash() bool { - // git ls-files uses unix slash `/` - return true -} |