From e0dc0e38b1ca181caa2e85df7561da36aef8054e Mon Sep 17 00:00:00 2001 From: Jaime Martinez Date: Wed, 9 Sep 2020 16:23:02 +1000 Subject: Add zip package that implements vfs.Root Adds a zip package capable of reading zip files from an httprange.Resource. It reads archive file contents into memory per archive that is requested from the vfs.Root. WIP: add simple test for archive reader WIP: fix build WIP: update archive test WIP: adding tests for archive WIP: print more info WIP: update zip file correct symlink WIP: use correct file Add bad symlink file to zip Update handling long symlinks update documentation and reorg code fix up stuff --- internal/httprange/http_reader.go | 4 + internal/serving/disk/reader.go | 11 +- internal/vfs/file.go | 7 +- internal/vfs/zip/archive.go | 183 +++++++++++++++++++++++++++++++ internal/vfs/zip/archive_test.go | 213 +++++++++++++++++++++++++++++++++++++ internal/vfs/zip/deflate_reader.go | 27 +++++ 6 files changed, 442 insertions(+), 3 deletions(-) create mode 100644 internal/vfs/zip/archive.go create mode 100644 internal/vfs/zip/archive_test.go create mode 100644 internal/vfs/zip/deflate_reader.go (limited to 'internal') diff --git a/internal/httprange/http_reader.go b/internal/httprange/http_reader.go index 474b589f..4e7db6bc 100644 --- a/internal/httprange/http_reader.go +++ b/internal/httprange/http_reader.go @@ -8,6 +8,7 @@ import ( "time" "gitlab.com/gitlab-org/gitlab-pages/internal/httptransport" + "gitlab.com/gitlab-org/gitlab-pages/internal/vfs" "gitlab.com/gitlab-org/gitlab-pages/metrics" ) @@ -41,6 +42,9 @@ type Reader struct { offset int64 } +// ensure that Reader is seekable +var _ vfs.SeekableFile = &Reader{} + // TODO: make this configurable/take an http client when creating a reader/ranged reader // instead https://gitlab.com/gitlab-org/gitlab-pages/-/issues/457 var httpClient = &http.Client{ diff --git a/internal/serving/disk/reader.go b/internal/serving/disk/reader.go index 350d036b..a0c0ed0d 100644 --- a/internal/serving/disk/reader.go +++ b/internal/serving/disk/reader.go @@ -194,10 +194,17 @@ func (reader *Reader) serveFile(ctx context.Context, w http.ResponseWriter, r *h return err } + w.Header().Set("Content-Type", contentType) + reader.fileSizeMetric.Observe(float64(fi.Size())) - w.Header().Set("Content-Type", contentType) - http.ServeContent(w, r, origPath, fi.ModTime(), file) + // Support SeekableFile if available + if rs, ok := file.(vfs.SeekableFile); ok { + http.ServeContent(w, r, origPath, fi.ModTime(), rs) + } else { + w.Header().Set("Content-Length", strconv.FormatInt(fi.Size(), 10)) + io.Copy(w, file) + } return nil } diff --git a/internal/vfs/file.go b/internal/vfs/file.go index 5260c847..47f68308 100644 --- a/internal/vfs/file.go +++ b/internal/vfs/file.go @@ -5,6 +5,11 @@ import "io" // File represents an open file, which will typically be the response body of a Pages request. type File interface { io.Reader - io.Seeker io.Closer } + +// SeekableFile represents a seekable file, which will typically be the response body of a Pages request. +type SeekableFile interface { + File + io.Seeker +} diff --git a/internal/vfs/zip/archive.go b/internal/vfs/zip/archive.go new file mode 100644 index 00000000..1d64e148 --- /dev/null +++ b/internal/vfs/zip/archive.go @@ -0,0 +1,183 @@ +package zip + +import ( + "archive/zip" + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "sync" + + log "github.com/sirupsen/logrus" + + "gitlab.com/gitlab-org/gitlab-pages/internal/httprange" + "gitlab.com/gitlab-org/gitlab-pages/internal/vfs" +) + +const ( + dirPrefix = "public/" + maxSymlinkSize = 256 +) + +var ( + errNotSymlink = errors.New("not a symlink") + errSymlinkSize = errors.New("symlink too long") +) + +// zipArchive implements the vfs.Root interface. +// It represents a zip archive saving all its files int memory. +// It holds an httprange.Resource that can be read with httprange.RangedReader in chunks. +type zipArchive struct { + path string + once sync.Once + done chan struct{} + + resource *httprange.Resource + reader *httprange.RangedReader + archive *zip.Reader + err error + + files map[string]*zip.File +} + +func newArchive(path string) *zipArchive { + return &zipArchive{ + path: path, + done: make(chan struct{}), + files: make(map[string]*zip.File), + } +} + +func (a *zipArchive) openArchive(ctx context.Context) error { + a.once.Do(func() { + a.readArchive(ctx) + }) + + // wait for readArchive to be done or return when the context is canceled + select { + case <-a.done: + return a.err + case <-ctx.Done(): + err := ctx.Err() + log.WithError(err).Traceln("open zip archive timed out") + + return err + } +} + +func (a *zipArchive) readArchive(ctx context.Context) { + defer close(a.done) + + a.resource, a.err = httprange.NewResource(ctx, a.path) + if a.err != nil { + return + } + + // load all archive files into memory using a cached ranged reader + a.reader = httprange.NewRangedReader(a.resource) + a.reader.WithCachedReader(func() { + a.archive, a.err = zip.NewReader(a.reader, a.resource.Size) + }) + + if a.archive != nil { + for _, file := range a.archive.File { + if !strings.HasPrefix(file.Name, dirPrefix) { + continue + } + a.files[file.Name] = file + } + + // recycle memory + a.archive.File = nil + } +} + +func (a *zipArchive) findFile(name string) *zip.File { + name = filepath.Join(dirPrefix, name) + + if file := a.files[name]; file != nil { + return file + } + + if dir := a.files[name+"/"]; dir != nil { + return dir + } + + return nil +} + +// Open finds the file by name inside the zipArchive and returns a reader that can be served by the VFS +func (a *zipArchive) Open(ctx context.Context, name string) (vfs.File, error) { + file := a.findFile(name) + if file == nil { + return nil, os.ErrNotExist + } + + dataOffset, err := file.DataOffset() + if err != nil { + return nil, err + } + + // only read from dataOffset up to the size of the compressed file + reader := a.reader.SectionReader(dataOffset, int64(file.CompressedSize64)) + + switch file.Method { + case zip.Deflate: + return newDeflateReader(reader), nil + case zip.Store: + return reader, nil + default: + return nil, fmt.Errorf("unsupported compression method: %x", file.Method) + } +} + +// Lstat finds the file by name inside the zipArchive and returns its FileInfo +func (a *zipArchive) Lstat(ctx context.Context, name string) (os.FileInfo, error) { + file := a.findFile(name) + if file == nil { + return nil, os.ErrNotExist + } + + return file.FileInfo(), nil +} + +// ReadLink finds the file by name inside the zipArchive and returns the contents of the symlink +func (a *zipArchive) Readlink(ctx context.Context, name string) (string, error) { + file := a.findFile(name) + if file == nil { + return "", os.ErrNotExist + } + + if file.FileInfo().Mode()&os.ModeSymlink != os.ModeSymlink { + return "", errNotSymlink + } + + rc, err := file.Open() + if err != nil { + return "", err + } + defer rc.Close() + + symlink := make([]byte, maxSymlinkSize+1) + + // read up to len(symlink) bytes from the link file + n, err := io.ReadFull(rc, symlink) + if err != nil && err != io.ErrUnexpectedEOF { + // if err == io.ErrUnexpectedEOF the link is smaller than len(symlink) so it's OK to not return it + return "", err + } + + // return errSymlinkSize if the number of bytes read from the link is too big + if n > maxSymlinkSize { + return "", errSymlinkSize + } + + // only return the n bytes read from the link + return string(symlink[:n]), nil +} + +// close no-op: everything can be recycled by the GC +func (a *zipArchive) close() {} diff --git a/internal/vfs/zip/archive_test.go b/internal/vfs/zip/archive_test.go new file mode 100644 index 00000000..05eab3b6 --- /dev/null +++ b/internal/vfs/zip/archive_test.go @@ -0,0 +1,213 @@ +package zip + +import ( + "context" + "io/ioutil" + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/stretchr/testify/require" + + "gitlab.com/gitlab-org/gitlab-pages/internal/testhelpers" +) + +var chdirSet = false + +func TestOpen(t *testing.T) { + zip, cleanup := openZipArchive(t) + defer cleanup() + + tests := map[string]struct { + file string + expectedContent string + expectedErr error + }{ + "file_exists": { + file: "index.html", + expectedContent: "zip.gitlab.io/project/index.html\n", + expectedErr: nil, + }, + "file_exists_in_subdir": { + file: "subdir/hello.html", + expectedContent: "zip.gitlab.io/project/subdir/hello.html\n", + expectedErr: nil, + }, + "file_exists_symlink": { + file: "symlink.html", + expectedContent: "subdir/linked.html", + expectedErr: nil, + }, + "is_dir": { + file: "subdir", + expectedContent: "zip.gitlab.io/project/index.html\n", + expectedErr: nil, + }, + "file_does_not_exist": { + file: "unknown.html", + expectedErr: os.ErrNotExist, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + f, err := zip.Open(context.Background(), tt.file) + if tt.expectedErr != nil { + require.EqualError(t, err, tt.expectedErr.Error()) + return + } + + require.NoError(t, err) + + if name == "is_dir" { + // cannot ioutil.ReadAll dirs but zip.Open should not fail + return + } + + data, err := ioutil.ReadAll(f) + require.NoError(t, err) + + require.Equal(t, tt.expectedContent, string(data)) + require.NoError(t, f.Close()) + }) + } +} + +func TestLstat(t *testing.T) { + zip, cleanup := openZipArchive(t) + defer cleanup() + + tests := map[string]struct { + file string + isDir bool + isSymlink bool + expectedErr error + }{ + "file_exists": { + file: "index.html", + }, + "file_exists_in_subdir": { + file: "subdir/hello.html", + }, + "file_exists_symlink": { + file: "symlink.html", + isSymlink: true, + }, + "is_dir": { + file: "subdir", + isDir: true, + }, + "file_does_not_exist": { + file: "unknown.html", + expectedErr: os.ErrNotExist, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + fi, err := zip.Lstat(context.Background(), tt.file) + if tt.expectedErr != nil { + require.EqualError(t, err, tt.expectedErr.Error()) + return + } + + require.NoError(t, err) + require.Contains(t, tt.file, fi.Name()) + require.Equal(t, tt.isDir, fi.IsDir()) + require.NotEmpty(t, fi.ModTime()) + + if tt.isDir { + require.Zero(t, fi.Size()) + require.True(t, fi.IsDir()) + return + } + + require.NotZero(t, fi.Size()) + + if tt.isSymlink { + require.NotZero(t, fi.Mode()&os.ModeSymlink) + } else { + require.True(t, fi.Mode().IsRegular()) + } + }) + } +} + +func TestReadLink(t *testing.T) { + zip, cleanup := openZipArchive(t) + defer cleanup() + + tests := map[string]struct { + file string + expectedErr error + }{ + "symlink_success": { + file: "symlink.html", + }, + "file": { + file: "index.html", + expectedErr: errNotSymlink, + }, + "dir": { + file: "subdir", + expectedErr: errNotSymlink, + }, + "symlink_too_big": { + file: "bad_symlink.html", + expectedErr: errSymlinkSize, + }, + "file_does_not_exist": { + file: "unknown.html", + expectedErr: os.ErrNotExist, + }, + } + + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + link, err := zip.Readlink(context.Background(), tt.file) + if tt.expectedErr != nil { + require.EqualError(t, err, tt.expectedErr.Error()) + return + } + + require.NoError(t, err) + require.NotEmpty(t, link) + }) + } +} +func openZipArchive(t *testing.T) (*zipArchive, func()) { + t.Helper() + + testServerURL, cleanup := newZipFileServerURL(t, "group/zip.gitlab.io/public.zip") + + zip := newArchive(testServerURL + "/public.zip") + + err := zip.openArchive(context.Background()) + require.NoError(t, err) + + // public/ public/index.html public/404.html public/symlink.html + // public/subdir/ public/subdir/hello.html public/subdir/linked.html + // public/bad_symlink.html public/subdir/2bp3Qzs... + require.NotZero(t, zip.files) + + return zip, func() { + zip.close() + cleanup() + } +} + +func newZipFileServerURL(t *testing.T, zipFilePath string) (string, func()) { + t.Helper() + + chdir := testhelpers.ChdirInPath(t, "../../../shared/pages", &chdirSet) + + testServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.ServeFile(w, r, zipFilePath) + })) + + return testServer.URL, func() { + chdir() + testServer.Close() + } +} diff --git a/internal/vfs/zip/deflate_reader.go b/internal/vfs/zip/deflate_reader.go new file mode 100644 index 00000000..2e55ee5a --- /dev/null +++ b/internal/vfs/zip/deflate_reader.go @@ -0,0 +1,27 @@ +package zip + +import ( + "compress/flate" + "io" +) + +type deflateReader struct { + R io.ReadCloser + D io.ReadCloser +} + +func (r *deflateReader) Read(p []byte) (n int, err error) { + return r.D.Read(p) +} + +func (r *deflateReader) Close() error { + r.R.Close() + return r.D.Close() +} + +func newDeflateReader(r io.ReadCloser) *deflateReader { + return &deflateReader{ + R: r, + D: flate.NewReader(r), + } +} -- cgit v1.2.3