diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-02-22 13:44:23 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-02-24 16:43:30 +0300 |
commit | 22ab115f6e0fc17c24e7923e871402cae6be3de3 (patch) | |
tree | 2f35c2296bade6853ba82cefc639c680f5d277af | |
parent | f9603bfd2da6488628524000cdbb082afef5b921 (diff) |
blob: Implement function to find LFS pointers
We're about to port the implementation of the blob service's LFS-related
RPCs to Go. The most important building block for this is to be able to
identify LFS pointers, either by walking a revision graph or by reading
blob IDs directly.
This commit introduces two new functions which implement this with
`findLFSPointersByRevisions()` and `readLFSPointers()`. While the former
one searches the revision graph for LFS pointers, the latter one accepts
a list of potential LFS pointers and returns all which in fact are valid
LFS pointers.
Except for the newly added tests, both functions are not yet used.
-rw-r--r-- | internal/gitaly/service/blob/lfs_pointers.go | 159 | ||||
-rw-r--r-- | internal/gitaly/service/blob/lfs_pointers_test.go | 185 |
2 files changed, 334 insertions, 10 deletions
diff --git a/internal/gitaly/service/blob/lfs_pointers.go b/internal/gitaly/service/blob/lfs_pointers.go index 96ec4531f..517ae3d9f 100644 --- a/internal/gitaly/service/blob/lfs_pointers.go +++ b/internal/gitaly/service/blob/lfs_pointers.go @@ -1,10 +1,19 @@ package blob import ( + "bufio" + "bytes" + "context" + "errors" "fmt" + "io" + "io/ioutil" + "strings" - "gitlab.com/gitlab-org/gitaly/internal/errors" + gitaly_errors "gitlab.com/gitlab-org/gitaly/internal/errors" "gitlab.com/gitlab-org/gitaly/internal/git" + "gitlab.com/gitlab-org/gitaly/internal/git/catfile" + "gitlab.com/gitlab-org/gitaly/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/internal/gitaly/rubyserver" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" "google.golang.org/grpc/codes" @@ -12,14 +21,14 @@ import ( ) const ( - // These limits are used as a heuristic to ignore files which can't be LFS - // pointers. The format of these is described in - // https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer - - // LfsPointerMinSize is the minimum size for an lfs pointer text blob - LfsPointerMinSize = 120 - // LfsPointerMaxSize is the maximum size for an lfs pointer text blob - LfsPointerMaxSize = 200 + // lfsPointerMaxSize is the maximum size for an lfs pointer text blob. This limit is used + // as a heuristic to filter blobs which can't be LFS pointers. The format of these pointers + // is described in https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer. + lfsPointerMaxSize = 200 +) + +var ( + errInvalidRevision = errors.New("invalid revision") ) type getLFSPointerByRevisionRequest interface { @@ -62,7 +71,7 @@ func (s *server) GetLFSPointers(req *gitalypb.GetLFSPointersRequest, stream gita func validateGetLFSPointersRequest(req *gitalypb.GetLFSPointersRequest) error { if req.GetRepository() == nil { - return errors.ErrEmptyRepository + return gitaly_errors.ErrEmptyRepository } if len(req.GetBlobIds()) == 0 { @@ -152,3 +161,133 @@ func validateGetAllLFSPointersRequest(in *gitalypb.GetAllLFSPointersRequest) err } return nil } + +// findLFSPointersByRevisions will return all LFS objects reachable via the given set of revisions. +// Revisions accept all syntax supported by git-rev-list(1). This function also accepts a set of +// options accepted by git-rev-list(1). Note that because git.Commands do not accept dashed +// positional arguments, it is currently not possible to mix options and revisions (e.g. "git +// rev-list master --not feature"). +func findLFSPointersByRevisions( + ctx context.Context, + repo *localrepo.Repo, + gitCmdFactory git.CommandFactory, + opts []git.Option, + revisions ...string, +) (_ []*gitalypb.LFSPointer, returnErr error) { + for _, revision := range revisions { + if strings.HasPrefix(revision, "-") && revision != "--all" && revision != "--not" { + return nil, fmt.Errorf("%w: %q", errInvalidRevision, revision) + } + } + + // git-rev-list(1) currently does not have any way to list all reachable objects of a + // certain type. As an optimization, we thus call it with `--object-names`, which will + // print an associated name for a given revision, if there is any. This allows us to skip + // at least some objects, namely all commits. + // + // It is questionable whether this optimization helps at all: by including object names, + // git cannot make use of bitmap indices. We thus pessimize git-rev-list(1) and optimize + // git-cat-file(1). And given that there's going to be a lot more blobs and trees (which + // both _do_ have an object name) than commits, it's probably an optimization which even + // slows down execution. Still, we keep this to stay compatible with the Ruby + // implementation. + var revListStderr bytes.Buffer + revlist, err := repo.Exec(ctx, nil, git.SubCmd{ + Name: "rev-list", + Flags: append([]git.Option{ + git.Flag{Name: "--in-commit-order"}, + git.Flag{Name: "--objects"}, + git.Flag{Name: "--object-names"}, + git.Flag{Name: fmt.Sprintf("--filter=blob:limit=%d", lfsPointerMaxSize)}, + }, opts...), + Args: revisions, + }, git.WithStderr(&revListStderr)) + if err != nil { + return nil, fmt.Errorf("could not execute rev-list: %w", err) + } + defer func() { + if err := revlist.Wait(); err != nil && returnErr == nil { + returnErr = fmt.Errorf("rev-list failed: %w, stderr: %q", + err, revListStderr.String()) + } + }() + + return readLFSPointers(ctx, repo, gitCmdFactory, revlist, true) +} + +// readLFSPointers reads object IDs of potential LFS pointers from the given reader and for each of +// them, it will determine whether the referenced object is an LFS pointer. Objects which are not a +// valid LFS pointer will be ignored. Objects which do not exist result in an error. +// +// If filterByObjectName is set to true, only IDs which have an associated object name will be +// read. This is helpful to pass output of git-rev-list(1) with `--object-names` directly to this +// function. +func readLFSPointers( + ctx context.Context, + repo *localrepo.Repo, + gitCmdFactory git.CommandFactory, + objectIDReader io.Reader, + filterByObjectName bool, +) ([]*gitalypb.LFSPointer, error) { + catfileBatch, err := catfile.New(ctx, gitCmdFactory, repo) + if err != nil { + return nil, fmt.Errorf("could not execute cat-file: %w", err) + } + + var lfsPointers []*gitalypb.LFSPointer + scanner := bufio.NewScanner(objectIDReader) + + for scanner.Scan() { + revision := git.Revision(scanner.Bytes()) + if filterByObjectName { + revAndPath := strings.SplitN(revision.String(), " ", 2) + if len(revAndPath) != 2 { + continue + } + revision = git.Revision(revAndPath[0]) + } + + objectInfo, err := catfileBatch.Info(ctx, revision) + if err != nil { + return nil, fmt.Errorf("could not get LFS pointer info: %w", err) + } + + if objectInfo.Type != "blob" { + continue + } + + blob, err := catfileBatch.Blob(ctx, revision) + if err != nil { + return nil, fmt.Errorf("could not retrieve LFS pointer: %w", err) + } + + data, err := ioutil.ReadAll(blob.Reader) + if err != nil { + return nil, fmt.Errorf("could not read LFS pointer: %w", err) + } + + if !isLFSPointer(data) { + continue + } + + lfsPointers = append(lfsPointers, &gitalypb.LFSPointer{ + Data: data, + Size: int64(len(data)), + Oid: revision.String(), + }) + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scanning LFS pointers failed: %w", err) + } + + return lfsPointers, nil +} + +// isLFSPointer determines whether the given blob contents are an LFS pointer or not. +func isLFSPointer(data []byte) bool { + // TODO: this is incomplete as it does not recognize pre-release version of LFS blobs with + // the "https://hawser.github.com/spec/v1" version. For compatibility with the Ruby RPC, we + // leave this as-is for now though. + return bytes.HasPrefix(data, []byte("version https://git-lfs.github.com/spec")) +} diff --git a/internal/gitaly/service/blob/lfs_pointers_test.go b/internal/gitaly/service/blob/lfs_pointers_test.go index a51bdd9e1..413ab37f6 100644 --- a/internal/gitaly/service/blob/lfs_pointers_test.go +++ b/internal/gitaly/service/blob/lfs_pointers_test.go @@ -1,12 +1,16 @@ package blob import ( + "errors" + "fmt" "io" "os/exec" "strings" "testing" "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/internal/git" + "gitlab.com/gitlab-org/gitaly/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/internal/testhelper" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" @@ -492,3 +496,184 @@ func refHasPtr(t *testing.T, repoPath, ref string, lfsPtr *gitalypb.LFSPointer) return strings.Contains(objects, lfsPtr.Oid) } + +func TestFindLFSPointersByRevisions(t *testing.T) { + gitCmdFactory := git.NewExecCommandFactory(config.Config) + + repoProto, _, cleanup := testhelper.NewTestRepo(t) + defer cleanup() + repo := localrepo.New(gitCmdFactory, repoProto, config.Config) + + ctx, cancel := testhelper.Context() + defer cancel() + + for _, tc := range []struct { + desc string + opts []git.Option + revs []string + expectedErr error + expectedLFSPointers []*gitalypb.LFSPointer + }{ + { + desc: "--all", + opts: []git.Option{ + git.Flag{Name: "--all"}, + }, + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer1], + lfsPointers[lfsPointer2], + lfsPointers[lfsPointer3], + lfsPointers[lfsPointer4], + lfsPointers[lfsPointer5], + lfsPointers[lfsPointer6], + }, + }, + { + desc: "--not --all", + opts: []git.Option{ + git.Flag{Name: "--not"}, + git.Flag{Name: "--all"}, + }, + }, + { + desc: "initial commit", + revs: []string{"1a0b36b3cdad1d2ee32457c102a8c0b7056fa863"}, + }, + { + desc: "master", + revs: []string{"master"}, + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer1], + }, + }, + { + desc: "multiple revisions", + revs: []string{"master", "moar-lfs-ptrs"}, + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer1], + lfsPointers[lfsPointer2], + lfsPointers[lfsPointer3], + }, + }, + { + desc: "invalid dashed option", + revs: []string{"master", "--foobar"}, + expectedErr: fmt.Errorf("invalid revision: \"--foobar\""), + }, + { + desc: "invalid revision", + revs: []string{"does-not-exist"}, + expectedErr: fmt.Errorf("fatal: ambiguous argument 'does-not-exist'"), + }, + } { + t.Run(tc.desc, func(t *testing.T) { + actualLFSPointers, err := findLFSPointersByRevisions( + ctx, repo, gitCmdFactory, tc.opts, tc.revs...) + if tc.expectedErr == nil { + require.NoError(t, err) + } else { + require.Contains(t, err.Error(), tc.expectedErr.Error()) + } + require.ElementsMatch(t, tc.expectedLFSPointers, actualLFSPointers) + }) + } +} + +func TestReadLFSPointers(t *testing.T) { + gitCmdFactory := git.NewExecCommandFactory(config.Config) + + repoProto, _, cleanup := testhelper.NewTestRepo(t) + defer cleanup() + repo := localrepo.New(gitCmdFactory, repoProto, config.Config) + + ctx, cancel := testhelper.Context() + defer cancel() + + for _, tc := range []struct { + desc string + input string + filterByObjectName bool + expectedErr error + expectedLFSPointers []*gitalypb.LFSPointer + }{ + { + desc: "single object ID", + input: strings.Join([]string{lfsPointer1}, "\n"), + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer1], + }, + }, + { + desc: "multiple object IDs", + input: strings.Join([]string{ + lfsPointer1, + lfsPointer2, + lfsPointer3, + lfsPointer4, + lfsPointer5, + lfsPointer6, + }, "\n"), + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer1], + lfsPointers[lfsPointer2], + lfsPointers[lfsPointer3], + lfsPointers[lfsPointer4], + lfsPointers[lfsPointer5], + lfsPointers[lfsPointer6], + }, + }, + { + desc: "multiple object IDs with name filter", + input: strings.Join([]string{ + lfsPointer1, + lfsPointer2, + lfsPointer3 + " x", + lfsPointer4, + lfsPointer5 + " z", + lfsPointer6 + " a", + }, "\n"), + filterByObjectName: true, + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer3], + lfsPointers[lfsPointer5], + lfsPointers[lfsPointer6], + }, + }, + { + desc: "non-pointer object", + input: strings.Join([]string{ + "60ecb67744cb56576c30214ff52294f8ce2def98", + }, "\n"), + }, + { + desc: "mixed objects", + input: strings.Join([]string{ + "60ecb67744cb56576c30214ff52294f8ce2def98", + lfsPointer2, + }, "\n"), + expectedLFSPointers: []*gitalypb.LFSPointer{ + lfsPointers[lfsPointer2], + }, + }, + { + desc: "missing object", + input: strings.Join([]string{ + "0101010101010101010101010101010101010101", + }, "\n"), + expectedErr: errors.New("object not found"), + }, + } { + t.Run(tc.desc, func(t *testing.T) { + reader := strings.NewReader(tc.input) + + actualLFSPointers, err := readLFSPointers( + ctx, repo, gitCmdFactory, reader, tc.filterByObjectName) + if tc.expectedErr == nil { + require.NoError(t, err) + } else { + require.Contains(t, err.Error(), tc.expectedErr.Error()) + } + require.ElementsMatch(t, tc.expectedLFSPointers, actualLFSPointers) + }) + } +} |