diff options
author | Will Chandler <wchandler@gitlab.com> | 2023-09-05 23:57:29 +0300 |
---|---|---|
committer | Will Chandler <wchandler@gitlab.com> | 2023-09-05 23:57:29 +0300 |
commit | 2d2653afc63b49a40cbee4dc2d5e4c2c700c1abe (patch) | |
tree | 16106dabfa4b665095d3cd499718764ca688d1f8 | |
parent | b9064e00593e573847ca1c6bb28f73b1b9331af0 (diff) | |
parent | 6a42099a2b2da4cd827daa2ffdb52e23a668a28b (diff) |
Merge branch 'ps-go-based-tar-writer' into 'master'
archive: Create tarball with archive/tar package
See merge request https://gitlab.com/gitlab-org/gitaly/-/merge_requests/6080
Merged-by: Will Chandler <wchandler@gitlab.com>
Approved-by: Patrick Steinhardt <psteinhardt@gitlab.com>
Approved-by: Will Chandler <wchandler@gitlab.com>
Reviewed-by: Patrick Steinhardt <psteinhardt@gitlab.com>
Reviewed-by: Pavlo Strokov <pstrokov@gitlab.com>
Reviewed-by: Pavlo Strokov <pavlo.strokov@refurbed.com>
Co-authored-by: Strokov Pavlo <strokovpavelsergeevich@gmail.com>
-rw-r--r-- | cmd/gitaly-hooks/hooks_test.go | 7 | ||||
-rw-r--r-- | internal/archive/archive.go | 238 | ||||
-rw-r--r-- | internal/archive/archive_test.go | 93 | ||||
-rw-r--r-- | internal/archive/testhelper_test.go | 11 | ||||
-rw-r--r-- | internal/git/gittest/command.go | 18 |
5 files changed, 359 insertions, 8 deletions
diff --git a/cmd/gitaly-hooks/hooks_test.go b/cmd/gitaly-hooks/hooks_test.go index 934f1a3ce..25cd7937f 100644 --- a/cmd/gitaly-hooks/hooks_test.go +++ b/cmd/gitaly-hooks/hooks_test.go @@ -826,9 +826,10 @@ remote: error executing git hook var stderr, stdout bytes.Buffer gittest.ExecOpts(t, cfg, gittest.ExecConfig{ - Env: envForHooks(t, ctx, cfg, repo, glHookValues{GitalyLogDir: logDir}, proxyValues{}), - Stdout: &stdout, - Stderr: &stderr, + Env: envForHooks(t, ctx, cfg, repo, glHookValues{GitalyLogDir: logDir}, proxyValues{}), + Stdout: &stdout, + Stderr: &stderr, + ExpectedExitCode: 128, }, args...) require.Equal(t, "", stdout.String()) diff --git a/internal/archive/archive.go b/internal/archive/archive.go index 20735edfe..4667f9ae0 100644 --- a/internal/archive/archive.go +++ b/internal/archive/archive.go @@ -1,14 +1,43 @@ package archive import ( + "archive/tar" "context" + "errors" "fmt" "io" + "io/fs" + "os" + "path/filepath" "runtime" + "strings" "gitlab.com/gitlab-org/gitaly/v16/internal/command" + "golang.org/x/exp/slices" ) +const ( + // readDirEntriesPageSize is an amount of fs.DirEntry(s) to read + // from the opened file descriptor of the directory. + readDirEntriesPageSize = 32 + + // Below is a set of flags used to decide what to do with the file/directory + // found on the disk. + // decisionWrite means that the file/directory needs to be added to the archive + decisionWrite = iota + // decisionSkip means that the file/directory shouldn't be added to the archive + decisionSkip + // decisionStop means that nothing else left to be added to the archive + decisionStop + + // tarFormat is a format to use for the archived files/directories/etc. + // The decision is made to use it as it is the latest version of the format. + // It allows sparse files, long file names, etc. that older formats doesn't support. + tarFormat = tar.FormatPAX +) + +type decider func(string) int + // WriteTarball writes a tarball to an `io.Writer` for the provided path // containing the specified archive members. Members should be specified // relative to `path`. @@ -32,3 +61,212 @@ func WriteTarball(ctx context.Context, writer io.Writer, path string, members .. return nil } + +// writeTarball creates a tar archive by flushing data into writer. +// Files and folders to be included into archive needs to be provided as members. +// If entry from members slice points to a folder all content inside it will be archived. +// The empty folders will be archived as well. +// It doesn't support sparse files efficiently, see https://github.com/golang/go/issues/22735 +func writeTarball(ctx context.Context, writer io.Writer, path string, members ...string) error { + // The function decides what to do with the entry: write, skip or finish archive + // creation if all members are added. + decide := func(candidate string) int { + if len(members) == 0 { + return decisionStop + } + + idx := slices.Index(members, candidate) + if idx < 0 { + return decisionSkip + } + + members = slices.Delete(members, idx, idx+1) + return decisionWrite + } + + archWriter := tar.NewWriter(writer) + if err := walkDir(ctx, archWriter, path, "", decide); err != nil { + return fmt.Errorf("walk dir: %w", err) + } + + if err := archWriter.Close(); err != nil { + return fmt.Errorf("closing archive writer: %w", err) + } + + return nil +} + +func walkDir(ctx context.Context, archWriter *tar.Writer, path, currentPrefix string, decide decider) error { + if cancelled(ctx) { + return ctx.Err() + } + + dir, err := os.Open(path) + if err != nil { + return fmt.Errorf("open dir: %w", err) + } + defer func() { _ = dir.Close() }() + + dirStat, err := dir.Stat() + if err != nil { + return fmt.Errorf("dir description: %w", err) + } + + if dirStat.Mode().Type() != fs.ModeDir { + return errors.New("is not a dir") + } + + for { + // Own implementation of the directory walker is used for optimization. + // fs.WalkDir() reads all the entries into memory at once and does the sorting. + // If there are a lot of files in the nested directories all that info will remain + // in the memory while the leaf is reached. In other words if there is 1000 files + // in the root and 1000 file in the nested directory of the root in one moment + // 2000 entries will remain in the memory for no reason. + dirEntries, err := dir.ReadDir(readDirEntriesPageSize) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + + return fmt.Errorf("chunked dir read: %w", err) + } + + for _, dirEntry := range dirEntries { + if cancelled(ctx) { + return ctx.Err() + } + + entryName := dirEntry.Name() + entryRelativePath := filepath.Join(currentPrefix, entryName) + + decision := decide(entryRelativePath) + switch decision { + case decisionStop: + // Nothing needs to be added to the archive. + return nil + case decisionSkip: + // Current entry doesn't need to be added, but we need travers it if it is a directory. + if dirEntry.IsDir() { + if err := walkDir(ctx, archWriter, filepath.Join(path, entryName), entryRelativePath, decide); err != nil { + return fmt.Errorf("walk dir: %w", err) + } + } + case decisionWrite: + if dirEntry.IsDir() { + // As this is a dir we add all its content to the archive. + if err := tarDir(ctx, archWriter, filepath.Join(path, entryName), entryRelativePath, func(string) int { return decisionWrite }); err != nil { + return fmt.Errorf("write dir: %w", err) + } + continue + } + + if err := tarFile(archWriter, path, entryRelativePath); err != nil { + return fmt.Errorf("write file: %w", err) + } + default: + return fmt.Errorf("unhandled decision: %d", decision) + } + } + } +} + +func tarDir(ctx context.Context, archWriter *tar.Writer, path, currentPrefix string, decide decider) error { + // The current entry is a directory that needs to be added to the archive. + stat, err := os.Lstat(path) + if err != nil { + return err + } + + link := "" + if stat.Mode()&fs.ModeSymlink != 0 { + link, err = os.Readlink(path) + if err != nil { + return fmt.Errorf("read link destination: %w", err) + } + } + + header, err := tar.FileInfoHeader(stat, link) + if err != nil { + return fmt.Errorf("file info header: %w", err) + } + header.Name = strings.TrimSuffix(currentPrefix, "/") + "/" + header.Format = tarFormat + + if err := archWriter.WriteHeader(header); err != nil { + return fmt.Errorf("write file info header: %w", err) + } + + if err := walkDir(ctx, archWriter, path, currentPrefix, decide); err != nil { + return fmt.Errorf("walk dir: %w", err) + } + + return nil +} + +func tarFile(archWriter *tar.Writer, path, entryRelativePath string) error { + entryPath := filepath.Join(path, filepath.Base(entryRelativePath)) + + entryStat, err := os.Lstat(entryPath) + if err != nil { + return fmt.Errorf("file/link description: %w", err) + } + + if entryStat.Mode()&fs.ModeSymlink != 0 { + targetPath, err := os.Readlink(entryPath) + if err != nil { + return fmt.Errorf("read link destination: %w", err) + } + + header := &tar.Header{ + Typeflag: tar.TypeSymlink, + Name: entryRelativePath, + Linkname: targetPath, + Format: tarFormat, + Mode: int64(entryStat.Mode().Perm()), + } + + if err := archWriter.WriteHeader(header); err != nil { + return fmt.Errorf("write symlink file info header: %w", err) + } + + return nil + } + + curFile, err := os.Open(entryPath) + if err != nil { + return fmt.Errorf("open file: %w", err) + } + defer func() { _ = curFile.Close() }() + + curFileStat, err := curFile.Stat() + if err != nil { + return fmt.Errorf("file description: %w", err) + } + + header, err := tar.FileInfoHeader(curFileStat, curFileStat.Name()) + if err != nil { + return fmt.Errorf("file info header: %w", err) + } + header.Name = entryRelativePath + header.Format = tarFormat + + if err := archWriter.WriteHeader(header); err != nil { + return fmt.Errorf("write file info header: %w", err) + } + + if _, err := io.Copy(archWriter, curFile); err != nil { + return fmt.Errorf("copy file: %w", err) + } + + return nil +} + +func cancelled(ctx context.Context) bool { + select { + case <-ctx.Done(): + return true + default: + return false + } +} diff --git a/internal/archive/archive_test.go b/internal/archive/archive_test.go new file mode 100644 index 000000000..39c2abbed --- /dev/null +++ b/internal/archive/archive_test.go @@ -0,0 +1,93 @@ +package archive + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/gittest" + "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper/testcfg" +) + +func TestWriteTarball(t *testing.T) { + t.Parallel() + + ctx := testhelper.Context(t) + tempDir := testhelper.TempDir(t) + srcDir := filepath.Join(tempDir, "src") + + // regular file + writeFile(t, filepath.Join(srcDir, "a.txt"), []byte("a")) + // empty dir + require.NoError(t, os.Mkdir(filepath.Join(srcDir, "empty_dir"), perm.PublicDir)) + // file with long name + writeFile(t, filepath.Join(srcDir, strings.Repeat("b", 150)+".txt"), []byte("b")) + // regular file that is not expected to be part of the archive (not in the members list) + writeFile(t, filepath.Join(srcDir, "excluded.txt"), []byte("excluded")) + // folder with multiple files all expected to be archived + nestedPath := filepath.Join(srcDir, "nested1") + for i := 0; i < readDirEntriesPageSize+1; i++ { + writeFile(t, filepath.Join(nestedPath, fmt.Sprintf("%d.txt", i)), []byte{byte(i)}) + } + // nested file that is not expected to be part of the archive + writeFile(t, filepath.Join(srcDir, "nested2/nested/nested/c.txt"), []byte("c")) + // deeply nested file + writeFile(t, filepath.Join(srcDir, "nested2/nested/nested/nested/nested/d.txt"), []byte("d")) + // file that is used to create a symbolic link, is not expected to be part of the archive + writeFile(t, filepath.Join(srcDir, "nested3/target.txt"), []byte("target")) + // link to the file above + require.NoError(t, os.Symlink(filepath.Join(srcDir, "nested3/target.txt"), filepath.Join(srcDir, "link.to.target.txt"))) + // directory that is a target of the symlink should not be archived + writeFile(t, filepath.Join(srcDir, "nested4/stub.txt"), []byte("symlinked")) + // link to the folder above + require.NoError(t, os.Symlink(filepath.Join(srcDir, "nested4"), filepath.Join(srcDir, "link.to.nested4"))) + + tarPath := filepath.Join(tempDir, "out.tar") + archFile, err := os.Create(tarPath) + require.NoError(t, err) + defer func() { _ = archFile.Close() }() + + err = writeTarball( + ctx, + archFile, + srcDir, + "a.txt", + strings.Repeat("b", 150)+".txt", + "nested1", + "nested2/nested/nested/nested/nested/d.txt", + "link.to.target.txt", + "link.to.nested4", + ) + require.NoError(t, err) + require.NoError(t, archFile.Close()) + + cfg := testcfg.Build(t) + + dstDir := filepath.Join(tempDir, "dst") + require.NoError(t, os.Mkdir(dstDir, perm.PublicDir)) + output, err := exec.Command("tar", "-xf", tarPath, "-C", dstDir).CombinedOutput() + require.NoErrorf(t, err, "%s", output) + diff := gittest.ExecOpts(t, cfg, gittest.ExecConfig{ExpectedExitCode: 1}, "diff", "--no-index", "--name-only", "--exit-code", dstDir, srcDir) + expected := strings.Join( + []string{ + filepath.Join(srcDir, "excluded.txt"), + filepath.Join(srcDir, "nested2/nested/nested/c.txt"), + filepath.Join(srcDir, "nested3/target.txt"), + filepath.Join(srcDir, "nested4/stub.txt\n"), + }, + "\n", + ) + require.Equal(t, expected, string(diff)) +} + +func writeFile(tb testing.TB, path string, data []byte) { + tb.Helper() + require.NoError(tb, os.MkdirAll(filepath.Dir(path), perm.PrivateDir)) + require.NoError(tb, os.WriteFile(path, data, perm.PrivateFile)) +} diff --git a/internal/archive/testhelper_test.go b/internal/archive/testhelper_test.go new file mode 100644 index 000000000..b8f9015cb --- /dev/null +++ b/internal/archive/testhelper_test.go @@ -0,0 +1,11 @@ +package archive + +import ( + "testing" + + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" +) + +func TestMain(m *testing.M) { + testhelper.Run(m) +} diff --git a/internal/git/gittest/command.go b/internal/git/gittest/command.go index 7fd44151b..862c7822b 100644 --- a/internal/git/gittest/command.go +++ b/internal/git/gittest/command.go @@ -26,6 +26,9 @@ type ExecConfig struct { // Env contains environment variables that should be appended to the spawned command's // environment. Env []string + // ExpectedExitCode is used to check the resulting exit code of the command. This can be used in case a command + // is expected to return an error code. + ExpectedExitCode int } // Exec runs a git command and returns the standard output, or fails. @@ -59,13 +62,18 @@ func ExecOpts(tb testing.TB, cfg config.Cfg, execCfg ExecConfig, args ...string) } func handleExecErr(tb testing.TB, cfg config.Cfg, execCfg ExecConfig, args []string, err error) { - if execCfg.Stderr == nil { - tb.Log(cfg.Git.BinPath, args) - if ee, ok := err.(*exec.ExitError); ok { - tb.Logf("%s\n", ee.Stderr) + var stderr []byte + if ee, ok := err.(*exec.ExitError); ok { + if execCfg.ExpectedExitCode == ee.ExitCode() { + return } - tb.Fatal(err) + stderr = ee.Stderr } + tb.Log(cfg.Git.BinPath, args) + if len(stderr) > 0 { + tb.Logf("%s\n", stderr) + } + tb.Fatal(err) } // NewCommand creates a new Git command ready for execution. |