diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-05-20 09:05:52 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-05-23 10:23:12 +0300 |
commit | 41fe00cab752de8701a0bc4f86866a30b9157d45 (patch) | |
tree | f3b24549ba18b6334a63f302ddfc6e87d61d0826 | |
parent | 2c70420ae66af132e431dbe9402c496edcea54da (diff) |
linguist: Implement wrapper to ignore gitconfig in Rugged
We're using the `git-linguist` binary to derive programming-language
statistics for a repository. This binary is using Rugged to read a given
commit and analyze all blobs referenced by the root tree so that it can
return accumulated byte counts for every language.
Unfortunately, `git-linguist` reads in gitconfig files by default with
no escape hatch, which sabotages our efforts to get gitconfig-clean in
the Gitaly codebase. We are thus forced to implement our own wrapper
script around the Linguist Gem that allows us to ignore the gitconfig in
Rugged.
Do so and implement a new `gitaly-linguist` binary that mostly mirrors
what `git-linguist` is doing. This allows us to override the Rugged
search path and point it to `/dev/null` so that it won't read any
gitconfig files at all.
Note that we do not use e.g. Gitaly's own gitconfig as computed by the
Git command factory. Ultimately, the Linguist Gem does not read any Git
configuration that would change its behaviour, so it would be overkill
to do so.
Changelog: changed
-rw-r--r-- | internal/gitaly/linguist/linguist.go | 12 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist_test.go | 125 | ||||
-rwxr-xr-x | ruby/bin/gitaly-linguist | 86 |
3 files changed, 209 insertions, 14 deletions
diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go index 7976ddfb5..39ac845c0 100644 --- a/internal/gitaly/linguist/linguist.go +++ b/internal/gitaly/linguist/linguist.go @@ -54,7 +54,7 @@ func New(cfg config.Cfg, gitCmdFactory git.CommandFactory) (*Instance, error) { // Stats returns the repository's language stats as reported by 'git-linguist'. func (inst *Instance) Stats(ctx context.Context, repoPath string, commitID string) (ByteCountPerLanguage, error) { - cmd, err := inst.startGitLinguist(ctx, repoPath, commitID, "stats") + cmd, err := inst.startGitLinguist(ctx, repoPath, commitID) if err != nil { return nil, fmt.Errorf("starting linguist: %w", err) } @@ -86,7 +86,7 @@ func (inst *Instance) Color(language string) string { return fmt.Sprintf("#%x", colorSha[0:3]) } -func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, commitID string, linguistCommand string) (*command.Command, error) { +func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, commitID string) (*command.Command, error) { bundle, err := exec.LookPath("bundle") if err != nil { return nil, fmt.Errorf("finding bundle executable: %w", err) @@ -95,11 +95,9 @@ func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, com args := []string{ bundle, "exec", - "bin/ruby-cd", - repoPath, - "git-linguist", + "bin/gitaly-linguist", + "--repository=" + repoPath, "--commit=" + commitID, - linguistCommand, } gitExecEnv := inst.gitCmdFactory.GetExecutionEnvironment(ctx) @@ -128,7 +126,7 @@ func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, com } internalCmd.SetMetricsCmd("git-linguist") - internalCmd.SetMetricsSubCmd(linguistCommand) + internalCmd.SetMetricsSubCmd("stats") return internalCmd, nil } diff --git a/internal/gitaly/linguist/linguist_test.go b/internal/gitaly/linguist/linguist_test.go index edc7a801c..24cacf81e 100644 --- a/internal/gitaly/linguist/linguist_test.go +++ 
b/internal/gitaly/linguist/linguist_test.go @@ -2,10 +2,12 @@ package linguist import ( "encoding/json" + "os" "path/filepath" "testing" "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v15/internal/git" "gitlab.com/gitlab-org/gitaly/v15/internal/git/gittest" "gitlab.com/gitlab-org/gitaly/v15/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/v15/internal/testhelper" @@ -16,17 +18,126 @@ func TestMain(m *testing.M) { testhelper.Run(m) } -func TestInstance_Stats_successful(t *testing.T) { +func TestInstance_Stats(t *testing.T) { ctx := testhelper.Context(t) + cfg := testcfg.Build(t) - cfg, _, repoPath := testcfg.BuildWithRepo(t) - - ling, err := New(cfg, gittest.NewCommandFactory(t, cfg)) + linguist, err := New(cfg, gittest.NewCommandFactory(t, cfg)) require.NoError(t, err) - counts, err := ling.Stats(ctx, repoPath, "1e292f8fedd741b75372e19097c76d327140c312") - require.NoError(t, err) - require.Equal(t, uint64(2943), counts["Ruby"]) + commitID := git.ObjectID("1e292f8fedd741b75372e19097c76d327140c312") + + for _, tc := range []struct { + desc string + setup func(t *testing.T) (string, git.ObjectID) + expectedStats ByteCountPerLanguage + expectedErr string + }{ + { + desc: "successful", + setup: func(t *testing.T) (string, git.ObjectID) { + _, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) + return repoPath, commitID + }, + expectedStats: map[string]uint64{ + "CoffeeScript": 107, + "HTML": 349, + "JavaScript": 1014, + "Ruby": 2943, + }, + }, + { + desc: "preexisting cache", + setup: func(t *testing.T) (string, git.ObjectID) { + _, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) + + // We simply run the linguist once before so that it can already + // write the cache. 
+ _, err := linguist.Stats(ctx, repoPath, commitID.String()) + require.NoError(t, err) + require.FileExists(t, filepath.Join(repoPath, "language-stats.cache")) + + return repoPath, commitID + }, + expectedStats: map[string]uint64{ + "CoffeeScript": 107, + "HTML": 349, + "JavaScript": 1014, + "Ruby": 2943, + }, + }, + { + desc: "corrupted cache", + setup: func(t *testing.T) (string, git.ObjectID) { + _, repoPath := gittest.CloneRepo(t, cfg, cfg.Storages[0]) + + require.NoError(t, os.WriteFile(filepath.Join(repoPath, "language-stats.cache"), []byte("garbage"), 0o644)) + + return repoPath, commitID + }, + expectedStats: map[string]uint64{ + "CoffeeScript": 107, + "HTML": 349, + "JavaScript": 1014, + "Ruby": 2943, + }, + }, + { + desc: "old cache", + setup: func(t *testing.T) (string, git.ObjectID) { + _, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + + oldCommitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(), gittest.WithTreeEntries( + gittest.TreeEntry{Path: "main.rb", Content: "require 'fileutils'", Mode: "100644"}, + )) + newCommitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithParents(oldCommitID), gittest.WithTreeEntries( + gittest.TreeEntry{Path: "main.go", Content: "package main", Mode: "100644"}, + )) + + // Precreate the cache with the old commit. This ensures that + // linguist knows to update the cache. 
+ stats, err := linguist.Stats(ctx, repoPath, oldCommitID.String()) + require.NoError(t, err) + require.FileExists(t, filepath.Join(repoPath, "language-stats.cache")) + require.Equal(t, ByteCountPerLanguage{ + "Ruby": 19, + }, stats) + + return repoPath, newCommitID + }, + expectedStats: map[string]uint64{ + "Go": 12, + }, + }, + { + desc: "missing repository", + setup: func(t *testing.T) (string, git.ObjectID) { + return filepath.Join(testhelper.TempDir(t), "nonexistent"), commitID + }, + expectedErr: "waiting for linguist: exit status 1", + }, + { + desc: "missing commit", + setup: func(t *testing.T) (string, git.ObjectID) { + _, repoPath := gittest.InitRepo(t, cfg, cfg.Storages[0]) + return repoPath, commitID + }, + expectedErr: "waiting for linguist: exit status 1", + }, + } { + t.Run(tc.desc, func(t *testing.T) { + repoPath, objectID := tc.setup(t) + + stats, err := linguist.Stats(ctx, repoPath, objectID.String()) + if tc.expectedErr == "" { + require.NoError(t, err) + require.Equal(t, tc.expectedStats, stats) + require.FileExists(t, filepath.Join(repoPath, "language-stats.cache")) + } else { + require.EqualError(t, err, tc.expectedErr) + } + }) + } } func TestInstance_Stats_unmarshalJSONError(t *testing.T) { diff --git a/ruby/bin/gitaly-linguist b/ruby/bin/gitaly-linguist new file mode 100755 index 000000000..f06e29b35 --- /dev/null +++ b/ruby/bin/gitaly-linguist @@ -0,0 +1,86 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require 'json' +require 'linguist' +require 'optparse' +require 'rugged' +require 'tempfile' +require 'zlib' + +LANGUAGE_STATS_CACHE = 'language-stats.cache' +LANGUAGE_STATS_CACHE_VERSION = "v3:#{Linguist::VERSION}" + +def gitaly_linguist(args) + repository_path = nil + commit = nil + + parser = OptionParser.new do |opts| + opts.on("-rREPOSITORY", "--repository=REPOSITORY", "Repository to scan") { |r| repository_path = r } + opts.on("-cCOMMIT", "--commit=COMMIT", "Commit to scan") { |c| commit = c } + opts.on("-h", "--help", 
"Prints this help") do + puts opts + exit + end + end + + parser.parse!(args) + + raise OptionParser::MissingArgument, 'repository' if repository_path.nil? + raise OptionParser::MissingArgument, 'commit' if commit.nil? + + Rugged::Settings['search_path_system'] = '/dev/null' + Rugged::Settings['search_path_global'] = '/dev/null' + Rugged::Settings['search_path_xdg'] = '/dev/null' + + repository = Rugged::Repository.bare(repository_path) + project = Linguist::Repository.new(repository, commit) + + if (cache = load_cache(repository_path)) + old_commit_oid, old_stats = cache + + project.load_existing_stats(old_commit_oid, old_stats) + end + + puts JSON.dump(project.languages) + + write_cache(repository_path, commit, project.cache) +end + +def cache_file(repo_path) + File.join(repo_path, LANGUAGE_STATS_CACHE) +end + +def load_cache(repo_path) + cached_data = File.open(cache_file(repo_path), "rb") do |f| + Zlib::Inflate.inflate(f.read) + end + + # rubocop:disable Security/MarshalLoad + # + # While this is ugly, it's the same as we previously did in git-linguist. So + # for backwards-compatibility reasons we can't change this. + version, commit, stats = Marshal.load(cached_data) + # rubocop:enable Security/MarshalLoad + + if version == LANGUAGE_STATS_CACHE_VERSION && commit && stats + [commit, stats] + end +rescue SystemCallError, ::Zlib::DataError, ::Zlib::BufError, TypeError + nil +end + +def write_cache(repo_path, commit, stats) + cache = [LANGUAGE_STATS_CACHE_VERSION, commit, stats] + + Tempfile.open('cache_file', repo_path) do |f| + marshal = Marshal.dump(cache) + f.write(Zlib::Deflate.deflate(marshal)) + f.close + File.rename(f.path, cache_file(repo_path)) + end + + FileUtils.chmod 0o644, cache_file(repo_path) +end + +gitaly_linguist(ARGV) |