diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2024-01-12 19:15:13 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2024-01-15 13:53:35 +0300 |
commit | da43db2215deee24f74eb19c7a81749aa5b7663f (patch) | |
tree | e8551aadcc1d3b4f59a13dfea5ebcc417990a087 | |
parent | 9be461c8617c48d3a86069250aba5dca0bc6971f (diff) |
Add WalkObjects to walk the object graph
The TransactionManager needs to walk objects in the quarantine to
identify which objects to commit with the transaction in a packfile
and which objects that already exist in the repository the transaction
depends on.
This commit adds WalkObjects to facilitate that. It walks the object
graph from given start points and prints out all objects IDs that it
encounters. If applicable, the objects path is also printed out to aid
packing them later.
Objects that are encountered but do not exist are printed out with a '?'
to mark them missing. When the WalkObjects is invoked with just the
quarantine directory without the alternates, this leads to the object walk
identifying the objects from the quarantine that we need to pack with the
missing objects being the transaction's dependencies that we must exist in
the repository already.
'git rev-list --missing=print' has an issue where it errors out if a start
point of a walk does not exist. This is a problem as the heads we feed as
the starting point of the walk may be objects that already exist in the
repository. In that case, they should be treated as the transaction's
dependencies and simply be printed out as missing objects. There is a
workaround in place to do so by checking whether the object exists before
feeding it to the walk. If it doesn't, we simply print it out as missing
after the walk. This allows us to plug in the functionality in
TransactionManager and later just remove the workaround from here once the
issue has been fixed in Git.
-rw-r--r-- | internal/git/localrepo/objects.go | 66 | ||||
-rw-r--r-- | internal/git/localrepo/objects_test.go | 208 |
2 files changed, 274 insertions, 0 deletions
diff --git a/internal/git/localrepo/objects.go b/internal/git/localrepo/objects.go index 4239f5218..3ac8f3ae5 100644 --- a/internal/git/localrepo/objects.go +++ b/internal/git/localrepo/objects.go @@ -1,6 +1,7 @@ package localrepo import ( + "bufio" "bytes" "context" "errors" @@ -148,6 +149,71 @@ func (repo *Repo) WalkUnreachableObjects(ctx context.Context, heads io.Reader, o return nil } +// WalkObjects walks the object graph starting from heads and writes to the output the objects it +// sees. Objects that are encountered during the walk and exist in the object database are written +// out as `<oid> <path>\n` with the `<path>` being sometimes present if applicable. Objects that +// were encountered during the walk but do not exist in the object database are written out +// as `?<oid>\n`. +func (repo *Repo) WalkObjects(ctx context.Context, heads io.Reader, output io.Writer) error { + // TransactionManager feeds here the reference tips updated in transaction. Some of the + // objects may exist in the repository and some may be new. `rev-list ---missing` currently + // fails if a starting point of the walk does not exist. This is a problem for us as this means + // the walk fails when updating a reference to an existing object in the repository. + // + // Below is a workaround for this problem until the actual issue in rev-list is fixed. + // Before we pass the heads to rev-list, we check whether they exist in the repo. If they + // do, we pass them to rev-list for the walk. If they do not, they are marked as missing + // object and appended to the output after the rev-list call has finished. + // + // Issue: https://gitlab.com/gitlab-org/git/-/issues/234 + var existingHeads []byte + var missingHeads []byte + + scanner := bufio.NewScanner(heads) + for scanner.Scan() { + objectID := scanner.Text() + + if _, err := repo.ReadObjectInfo(ctx, git.Revision(objectID)); err != nil { + var invalidObjectError InvalidObjectError + if errors.As(err, &invalidObjectError) { + missingHeads = append(missingHeads, []byte("?"+objectID+"\n")...) + continue + } + + return fmt.Errorf("read object info: %w", err) + } + + existingHeads = append(existingHeads, []byte(objectID+"\n")...) + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("scanner: %w", err) + } + + var stderr bytes.Buffer + if err := repo.ExecAndWait(ctx, + git.Command{ + Name: "rev-list", + Flags: []git.Option{ + git.Flag{Name: "--objects"}, + git.Flag{Name: "--missing=print"}, + git.Flag{Name: "--stdin"}, + }, + }, + git.WithStdin(bytes.NewReader(existingHeads)), + git.WithStdout(output), + git.WithStderr(&stderr), + ); err != nil { + return structerr.New("rev-list: %w", err).WithMetadata("stderr", stderr.String()) + } + + if _, err := output.Write(missingHeads); err != nil { + return fmt.Errorf("write missing heads: %w", err) + } + + return nil +} + // PackObjects takes in object IDs separated by newlines. It packs the objects into a pack file and // writes it into the output. func (repo *Repo) PackObjects(ctx context.Context, objectIDs io.Reader, output io.Writer) error { diff --git a/internal/git/localrepo/objects_test.go b/internal/git/localrepo/objects_test.go index eb59b0926..58049bfca 100644 --- a/internal/git/localrepo/objects_test.go +++ b/internal/git/localrepo/objects_test.go @@ -2,6 +2,8 @@ package localrepo import ( "bytes" + "os" + "path/filepath" "strings" "testing" @@ -211,6 +213,212 @@ func TestWalkUnreachableObjects(t *testing.T) { } } +func TestWalkObjects(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + + cfg, repo, repoPath := setupRepo(t) + + // This blob is not expected to be encountered during walk as the tree referencing it is missing. + unexpectedBlob := gittest.WriteBlob(t, cfg, repoPath, []byte("we should not see this blob during the walk")) + // Create a commit which has a tree missing from the object database. We expect the commit to be reported with the tree reported as missing. + missingTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{{ + OID: unexpectedBlob, + Mode: "100644", + Path: "this tree will be deleted so this path should not be output", + }}) + rootCommit1 := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(missingTree)) + + // Create a commit with a blob missing from the tree. We expect the commit and tree be reported with the blob reported as missing. + missingBlob := gittest.WriteBlob(t, cfg, repoPath, []byte("pruned blob")) + rootCommit2Tree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{{ + OID: missingBlob, + Mode: "100644", + Path: "missing_blob", + }}) + rootCommit2 := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(rootCommit2Tree)) + + // This commit is deleted and is expected to be reported as missing when it's encountered during the walk. + missingRootCommit := gittest.WriteCommit(t, cfg, repoPath) + + // Create a commit with two parents. We expect both parents to be included. + mergeCommit := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(gittest.DefaultObjectHash.EmptyTreeOID), gittest.WithParents(rootCommit1, rootCommit2, missingRootCommit)) + + // This commit is missing its only parent. Walking this we expect the commit to be reported but the parent reported as missing. + commitMissingParent := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(gittest.DefaultObjectHash.EmptyTreeOID), gittest.WithParents(missingRootCommit)) + + // This tag is referencing a missing commit. We expect the tag to be reported and the commit reported as missing. + tagMissingParent := gittest.WriteTag(t, cfg, repoPath, "refs/tags/tag-missing-parent", missingRootCommit.Revision(), gittest.WriteTagConfig{ + Message: "tag missing parent", + }) + + // Create a hierarchy with some files and subdirectories to see they are reported expectedly. + blob1 := gittest.WriteBlob(t, cfg, repoPath, []byte("blob 1")) + blob2 := gittest.WriteBlob(t, cfg, repoPath, []byte("blob 2")) + blob3 := gittest.WriteBlob(t, cfg, repoPath, []byte("blob 3")) + leafCommitSubtree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + { + OID: blob2, + Mode: "100644", + Path: "blob2", + }, + { + OID: blob3, + Mode: "100644", + Path: "blob3", + }, + }) + leafCommitTree := gittest.WriteTree(t, cfg, repoPath, []gittest.TreeEntry{ + { + OID: leafCommitSubtree, + Mode: "040000", + Path: "subtree", + }, + { + OID: blob1, + Mode: "100644", + Path: "blob1", + }, + }) + + // Create two commits that diverge from the same parent. When walking one, we don't expect to get the other one reported. + leafCommit := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(leafCommitTree), gittest.WithParents(mergeCommit)) + divergedCommit := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(gittest.DefaultObjectHash.EmptyTreeOID), gittest.WithParents(mergeCommit)) + divergedTag := gittest.WriteTag(t, cfg, repoPath, "refs/tags/diverged-tag", divergedCommit.Revision(), gittest.WriteTagConfig{ + Message: "diverged tag", + }) + + // Remove the objects that we use for testing missing objects. + for _, removedOID := range []string{ + missingBlob.String(), + missingTree.String(), + missingRootCommit.String(), + } { + require.NoError(t, os.Remove(filepath.Join(repoPath, "objects", removedOID[:2], removedOID[2:]))) + } + + for _, tc := range []struct { + desc string + heads []git.ObjectID + expectedOutput []string + expectedErrorMessage string + }{ + { + desc: "no heads", + }, + { + desc: "missing start points", + heads: []git.ObjectID{ + missingBlob, + missingRootCommit, + }, + expectedOutput: []string{ + "?" + missingBlob.String(), + "?" + missingRootCommit.String(), + }, + }, + { + desc: "commit missing parent commit", + heads: []git.ObjectID{ + commitMissingParent, + }, + expectedOutput: []string{ + commitMissingParent.String(), + gittest.DefaultObjectHash.EmptyTreeOID.String() + " ", + "?" + missingRootCommit.String(), + }, + }, + { + desc: "annotated tag missing referenced commit", + heads: []git.ObjectID{ + tagMissingParent, + }, + // This is the actual expected output. Due to an issue in Git, rev-list fails when it encounters + // the missing commit even if `--missing=print` is used. + // + // Issue: https://gitlab.com/gitlab-org/git/-/issues/239 + // + /*expectedOutput: []string{ + tagMissingParent.String(), + "?"+ missingRootCommit.String(), + },*/ + expectedErrorMessage: "rev-list: exit status 128", + }, + { + desc: "diverged tag and commit are not reported", + heads: []git.ObjectID{ + leafCommit, + }, + expectedOutput: []string{ + leafCommit.String(), + leafCommitTree.String() + " ", + blob1.String() + " blob1", + leafCommitSubtree.String() + " subtree", + blob2.String() + " subtree/blob2", + blob3.String() + " subtree/blob3", + mergeCommit.String(), + gittest.DefaultObjectHash.EmptyTreeOID.String() + " ", + "?" + missingRootCommit.String(), + rootCommit2.String(), + rootCommit2Tree.String() + " ", + "?" + missingBlob.String(), + rootCommit1.String(), + "?" + missingTree.String(), + }, + }, + { + desc: "walk the leaf commit and diverged tag", + heads: []git.ObjectID{ + leafCommit, + divergedTag, + }, + expectedOutput: []string{ + divergedTag.String() + " refs/tags/diverged-tag", + divergedCommit.String(), + leafCommit.String(), + leafCommitTree.String() + " ", + blob1.String() + " blob1", + leafCommitSubtree.String() + " subtree", + blob2.String() + " subtree/blob2", + blob3.String() + " subtree/blob3", + mergeCommit.String(), + gittest.DefaultObjectHash.EmptyTreeOID.String() + " ", + "?" + missingRootCommit.String(), + rootCommit2.String(), + rootCommit2Tree.String() + " ", + "?" + missingBlob.String(), + rootCommit1.String(), + "?" + missingTree.String(), + }, + }, + } { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + + var heads []string + for _, head := range tc.heads { + heads = append(heads, head.String()) + } + + var output bytes.Buffer + err := repo.WalkObjects(ctx, strings.NewReader(strings.Join(heads, "\n")), &output) + if tc.expectedErrorMessage != "" { + require.Equal(t, tc.expectedErrorMessage, err.Error()) + return + } + require.NoError(t, err) + + var actualOutput []string + if output.Len() > 0 { + actualOutput = strings.Split(strings.TrimSpace(output.String()), "\n") + } + require.ElementsMatch(t, tc.expectedOutput, actualOutput) + }) + } +} + func TestPackAndUnpackObjects(t *testing.T) { t.Parallel() |