diff options
author | Jacob Vosmaer <jacob@gitlab.com> | 2019-07-01 12:38:00 +0300 |
---|---|---|
committer | Jacob Vosmaer <jacob@gitlab.com> | 2019-07-01 12:38:00 +0300 |
commit | c52f057e907f7b59657d672452aee8f6acd2b7c7 (patch) | |
tree | f03c7e91af6186b1dc96097b0186a5a84f42ee20 | |
parent | 2760a2bbd607b8eb957c1c6e18c177a5e2c753ee (diff) |
Move more stuff to packfile
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | internal/git/packfile/bitmap.go | 99 | ||||
-rw-r--r-- | internal/git/packfile/index.go | 259 | ||||
-rw-r--r-- | parse-bitmap.go | 298 |
4 files changed, 382 insertions, 275 deletions
diff --git a/.gitignore b/.gitignore index b7fbb0b4c..5d5d76db0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ git-env gitaly.pid /vendor/github.com/libgit2/git2go/vendor /vendor +/parse-bitmap diff --git a/internal/git/packfile/bitmap.go b/internal/git/packfile/bitmap.go index 971ed5405..6a5d40c55 100644 --- a/internal/git/packfile/bitmap.go +++ b/internal/git/packfile/bitmap.go @@ -1,18 +1,106 @@ package packfile import ( + "bufio" "encoding/binary" + "encoding/hex" "fmt" "io" "math" + "os" + + "gitlab.com/gitlab-org/gitaly/internal/git/gitio" ) +type Bitmap struct { + Commits *EWAH + Trees *EWAH + Blobs *EWAH + Tags *EWAH + nBitmapCommits uint32 +} + +func (idx *Index) LoadBitmap() error { + f, err := os.Open(idx.packBase + ".bitmap") + if err != nil { + return err + } + defer f.Close() + + r := bufio.NewReader(gitio.NewHashfileReader(f)) + + bmp := &Bitmap{} + bmp.nBitmapCommits, err = idx.parseBitmapHeader(r) + if err != nil { + return err + } + + for _, ptr := range []**EWAH{&bmp.Commits, &bmp.Trees, &bmp.Blobs, &bmp.Tags} { + *ptr, err = ReadEWAH(r) + if err != nil { + return err + } + } + for i := uint32(0); i < bmp.nBitmapCommits; i++ { + const entryHeaderLen = 6 + if _, err := r.Discard(entryHeaderLen); err != nil { + return err + } + + if _, err := ReadEWAH(r); err != nil { + return err + } + } + + // TODO handle hashcache + + if _, err := r.Peek(1); err != io.EOF { + return fmt.Errorf("expected EOF, got %v", err) + } + + idx.Bitmap = bmp + return nil +} + +func (idx *Index) parseBitmapHeader(r io.Reader) (uint32, error) { + const headerLen = 32 + header, err := readN(r, headerLen) + if err != nil { + return 0, err + } + + const sig = "BITM\x00\x01" + if actualSig := string(header[:len(sig)]); actualSig != sig { + return 0, fmt.Errorf("unexpected signature %q", actualSig) + } + header = header[len(sig):] + + const flagLen = 2 + flags := binary.BigEndian.Uint16(header[:flagLen]) + const minFlags = 1 + if flags&minFlags < minFlags { + return 0, fmt.Errorf("invalid flags %x", flags) + } + header = header[flagLen:] + + count := binary.BigEndian.Uint32(header[:4]) + header = header[4:] + + if s := hex.EncodeToString(header); s != idx.ID { + return 0, fmt.Errorf("unexpected pack ID in bitmap header: %s", s) + } + + return count, nil +} + type EWAH struct { raw []byte bits uint32 words uint32 } +const ewahTrailerLen = 4 + func ReadEWAH(r io.Reader) (*EWAH, error) { header := make([]byte, 8) if _, err := io.ReadFull(r, header); err != nil { @@ -24,7 +112,7 @@ func ReadEWAH(r io.Reader) (*EWAH, error) { words: binary.BigEndian.Uint32(header[4:]), } - rawSize := int64(e.words)*8 + 4 + rawSize := int64(e.words)*8 + ewahTrailerLen if rawSize > math.MaxInt32 { return nil, fmt.Errorf("EWAH bitmap does not fit in Go slice") } @@ -40,8 +128,9 @@ func ReadEWAH(r io.Reader) (*EWAH, error) { func (e *EWAH) Scan(f func(uint32) error) error { bit := uint32(0) - lastByte := int(8 * e.words) - for cursor := 0; cursor < lastByte; { + cursor := 0 + lastByte := len(e.raw) - ewahTrailerLen + for cursor < lastByte && bit < e.bits { header := binary.BigEndian.Uint64(e.raw[cursor : cursor+8]) cursor += 8 @@ -77,5 +166,9 @@ func (e *EWAH) Scan(f func(uint32) error) error { } } + if cursor != lastByte { + return fmt.Errorf("expected bitmap to use %d bytes, but consumed only %d", lastByte, cursor) + } + return nil } diff --git a/internal/git/packfile/index.go b/internal/git/packfile/index.go new file mode 100644 index 000000000..426bbac33 --- /dev/null +++ b/internal/git/packfile/index.go @@ -0,0 +1,259 @@ +package packfile + +import ( + "bufio" + "crypto/sha1" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "math" + "os" + "regexp" + "sort" + "strconv" + + "gitlab.com/gitlab-org/gitaly/internal/git/gitio" +) + +const sumSize = sha1.Size + +var ( + idxFileRegex = regexp.MustCompile(`\A(.*/pack-)([0-9a-f]{40})\.idx\z`) +) + +type Index struct { + ID string + packBase string + Objects []*Object + fanOut [256]int + *Bitmap +} + +func ReadIndex(idxPath string) (*Index, error) { + reMatches := idxFileRegex.FindStringSubmatch(idxPath) + if len(reMatches) == 0 { + return nil, fmt.Errorf("invalid idx filename: %q", idxPath) + } + + idx := &Index{ + packBase: reMatches[1] + reMatches[2], + ID: reMatches[2], + } + + f, err := os.Open(idx.packBase + ".idx") + if err != nil { + return nil, err + } + defer f.Close() + + r := bufio.NewReader(gitio.NewHashfileReader(f)) + + const sig = "\377tOc\x00\x00\x00\x02" + actualSig, err := readN(r, len(sig)) + if s := string(actualSig); s != sig { + return nil, fmt.Errorf("unexpected idx signature %q", s) + } + + count, err := idx.nPackObjects() + if err != nil { + return nil, err + } + + // TODO use a data structure other than a Go slice to hold the index + // entries? Go slices use int as their index type, and int may not be + // able to hold MaxUint32. + if count > math.MaxInt32 { + return nil, fmt.Errorf("too many objects in to fit in Go slice: %d", count) + } + idx.Objects = make([]*Object, count) + + for i := range idx.fanOut { + n, err := readUint32(r) + if err != nil { + return nil, err + } + + idx.fanOut[i] = int(n) // cast is safe because we know n<=len(idx.Objects) + } + + buf := make([]byte, sumSize) + for i := 0; i < len(idx.Objects); i++ { + if _, err := io.ReadFull(r, buf); err != nil { + return nil, err + } + idx.Objects[i] = &Object{OID: hex.EncodeToString(buf)} + } + + // Discard CRC32 values (one for each object) + for i := 0; i < len(idx.Objects); i++ { + if _, err := r.Discard(4); err != nil { + return nil, err + } + } + + // Read 4-byte offsets + has8ByteOffsets := false + for i := 0; i < len(idx.Objects); i++ { + offset, err := readUint32(r) + if err != nil { + return nil, err + } + + const mask = 1 << 31 + if offset&mask == mask { + has8ByteOffsets = true + continue + } + + idx.Objects[i].Offset = uint64(offset) + } + + if has8ByteOffsets { + for i := 0; i < len(idx.Objects); i++ { + offset, err := readUint64(r) + if err != nil { + return nil, err + } + + // TODO Not clear if all 8-byte offsets are populated, or only those that + // don't fit into 4 bytes. + if offset > 0 { + idx.Objects[i].Offset = offset + } + } + } + + idxPackID, err := readN(r, sumSize) + if err != nil { + return nil, err + } + + if s := hex.EncodeToString(idxPackID); s != idx.ID { + return nil, fmt.Errorf("unexpected pack ID in idx: %s", s) + } + + if _, err := r.Peek(1); err != io.EOF { + if err == nil { + err = fmt.Errorf("unexpected trailing data, expected EOF") + } + return nil, err + } + + return idx, nil +} + +func (idx *Index) GetObject(oid string) (*Object, bool) { + if len(oid) < 2 { + return nil, false + } + + radix64, err := strconv.ParseInt(oid[:2], 16, 0) + if err != nil { + return nil, false + } + + radix := int(radix64) + last := idx.fanOut[radix] + first := 0 + if radix > 0 { + first = idx.fanOut[radix-1] + } + + objRange := idx.Objects[first:last] + objIdx := sort.Search(len(objRange), func(i int) bool { + return objRange[i].OID >= oid + }) + if objIdx == len(objRange) { + return nil, false + } + obj := objRange[objIdx] + + if obj.OID != oid { + return nil, false + } + + return obj, true +} + +func (idx *Index) nPackObjects() (uint32, error) { + f, err := idx.openPack() + if err != nil { + return 0, err + } + defer f.Close() + + const headerLen = 12 + header, err := readN(f, headerLen) + if err != nil { + return 0, err + } + + const sig = "PACK\x00\x00\x00\x02" + if s := string(header[:len(sig)]); s != sig { + return 0, fmt.Errorf("unexpected pack signature %q", s) + } + header = header[len(sig):] + + return binary.BigEndian.Uint32(header), nil +} + +func (idx *Index) openPack() (f *os.File, err error) { + packPath := idx.packBase + ".pack" + f, err = os.Open(packPath) + if err != nil { + return nil, err + } + + defer func(f *os.File) { + if err != nil { + f.Close() + } + }(f) // Bind f early so that we can do "return nil, err". + + if _, err := f.Seek(-sumSize, io.SeekEnd); err != nil { + return nil, err + } + + sum, err := readN(f, sumSize) + if err != nil { + return nil, err + } + + if s := hex.EncodeToString(sum); s != idx.ID { + return nil, fmt.Errorf("unexpected trailing checksum in .pack: %s", s) + } + + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, err + } + + return f, nil +} + +func readUint32(r io.Reader) (uint32, error) { + buf, err := readN(r, 4) + if err != nil { + return 0, err + } + + return binary.BigEndian.Uint32(buf), nil +} + +func readUint64(r io.Reader) (uint64, error) { + buf, err := readN(r, 8) + if err != nil { + return 0, err + } + + return binary.BigEndian.Uint64(buf), nil +} + +func readN(r io.Reader, n int) ([]byte, error) { + buf := make([]byte, n) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, err + } + + return buf, nil +} diff --git a/parse-bitmap.go b/parse-bitmap.go index a9773e5ca..9163dac23 100644 --- a/parse-bitmap.go +++ b/parse-bitmap.go @@ -2,18 +2,14 @@ package main import ( "bufio" - "encoding/binary" - "encoding/hex" "flag" "fmt" - "io" "log" - "math" "os" "regexp" "sort" + "time" - "gitlab.com/gitlab-org/gitaly/internal/git/gitio" "gitlab.com/gitlab-org/gitaly/internal/git/packfile" ) @@ -35,65 +31,54 @@ func main() { } func _main(packIdx string) error { - reMatches := idxFileRegex.FindStringSubmatch(packIdx) - if len(reMatches) == 0 { - return fmt.Errorf("invalid idx filename: %q", packIdx) - } - - packBase := reMatches[1] + reMatches[2] - packID := reMatches[2] - - log.Print("parsing idx") - idxObjects, err := readIndex(packBase, packID) + start := time.Now() + idx, err := packfile.ReadIndex(packIdx) if err != nil { return err } - log.Printf("found %d objects", len(idxObjects)) + log.Printf("loaded idx with %d objects in %v", len(idx.Objects), time.Since(start)) - f, err := os.Open(packBase + ".bitmap") - if err != nil { - return err - } - defer f.Close() - - r := bufio.NewReader(gitio.NewHashfileReader(f)) - - nBitmapCommits, err := parseBitmapHeader(r, packID) - if err != nil { + start = time.Now() + if err := idx.LoadBitmap(); err != nil { return err } + log.Printf("loaded bitmap in %v", time.Since(start)) + start = time.Now() // Sort objects by pack offset, because the object type bitmaps use // packfile order. - packObjects := make([]*packfile.Object, len(idxObjects)) - copy(packObjects, idxObjects) + packObjects := make([]*packfile.Object, len(idx.Objects)) + copy(packObjects, idx.Objects) - log.Print("sorting object list") sort.Sort(packfile.PackfileOrder(packObjects)) - - // The type bitmaps come in this specific order: commit, tree, blob, tag. - log.Print("labeling objects") - for _, t := range []packfile.ObjectType{packfile.TCommit, packfile.TTree, packfile.TBlob, packfile.TTag} { - ewah, err := packfile.ReadEWAH(r) - if err != nil { - return err - } - + log.Printf("sorted object list in %v", time.Since(start)) + + start = time.Now() + for _, t := range []struct { + objectType packfile.ObjectType + ewah *packfile.EWAH + }{ + {packfile.TCommit, idx.Bitmap.Commits}, + {packfile.TTree, idx.Bitmap.Trees}, + {packfile.TBlob, idx.Bitmap.Blobs}, + {packfile.TTag, idx.Bitmap.Tags}, + } { setFunc := func(i uint32) error { obj := packObjects[i] if obj.Type != packfile.TUnknown { return fmt.Errorf("type already set for object %v", obj) } - obj.Type = t + obj.Type = t.objectType return nil } - if err := ewah.Scan(setFunc); err != nil { + if err := t.ewah.Scan(setFunc); err != nil { return err } } + log.Printf("labeled objects in %v", time.Since(start)) for _, obj := range packObjects { if obj.Type == packfile.TUnknown { @@ -101,25 +86,10 @@ func _main(packIdx string) error { } } - for i := uint32(0); i < nBitmapCommits; i++ { - const entryHeaderLen = 6 - if _, err := r.Discard(entryHeaderLen); err != nil { - return err - } - - if _, err := packfile.ReadEWAH(r); err != nil { - return err - } - } - - if _, err := r.Peek(1); err != io.EOF { - return fmt.Errorf("expected EOF, got %v", err) - } - out := bufio.NewWriter(os.Stdout) defer out.Flush() + fmt.Fprintf(out, "# pack-%s\n", idx.ID) - fmt.Fprintf(out, "# pack-%s\n", packID) for _, o := range packObjects { if *printMap { c := "" @@ -142,219 +112,3 @@ func _main(packIdx string) error { return nil } - -func parseBitmapHeader(r io.Reader, packID string) (uint32, error) { - const headerLen = 32 - header, err := readN(r, headerLen) - if err != nil { - return 0, err - } - - const sig = "BITM\x00\x01" - if actualSig := string(header[:len(sig)]); actualSig != sig { - return 0, fmt.Errorf("unexpected signature %q", actualSig) - } - header = header[len(sig):] - - const flagLen = 2 - flags := binary.BigEndian.Uint16(header[:flagLen]) - const minFlags = 1 - if flags&minFlags < minFlags { - return 0, fmt.Errorf("invalid flags %x", flags) - } - header = header[flagLen:] - - count := binary.BigEndian.Uint32(header[:4]) - header = header[4:] - - if s := hex.EncodeToString(header); s != packID { - return 0, fmt.Errorf("unexpected pack ID in bitmap header: %s", s) - } - - return count, nil -} - -const sumSize = 20 - -func readIndex(packBase, packID string) ([]*packfile.Object, error) { - f, err := os.Open(packBase + ".idx") - if err != nil { - return nil, err - } - defer f.Close() - - r := bufio.NewReader(gitio.NewHashfileReader(f)) - - const sig = "\377tOc\x00\x00\x00\x02" - actualSig, err := readN(r, len(sig)) - if s := string(actualSig); s != sig { - return nil, fmt.Errorf("unexpected idx signature %q", s) - } - - count, err := nPackObjects(packBase, packID) - if err != nil { - return nil, err - } - - // Fanout is used to speed up random access to the index. We are doing a - // sequential read so we don't need the fanout table. - const fanOutLen = 4 * 256 - if _, err := r.Discard(fanOutLen); err != nil { - return nil, err - } - - // TODO use a data structure other than a Go slice to hold the index - // entries? Go slices use int as their index type, and int may not be - // able to hold MaxUint32. - if count > math.MaxInt32 { - return nil, fmt.Errorf("too many objects in packfile to fit in Go slice: %d", count) - } - objects := make([]*packfile.Object, count) - - buf := make([]byte, sumSize) - for i := 0; i < len(objects); i++ { - if _, err := io.ReadFull(r, buf); err != nil { - return nil, err - } - objects[i] = &packfile.Object{OID: hex.EncodeToString(buf)} - } - - // Discard CRC32 values (one for each object) - for i := 0; i < len(objects); i++ { - if _, err := r.Discard(4); err != nil { - return nil, err - } - } - - // Read 4-byte offsets - has8ByteOffsets := false - for i := 0; i < len(objects); i++ { - offset, err := readUint32(r) - if err != nil { - return nil, err - } - - const mask = 1 << 31 - if offset&mask == mask { - has8ByteOffsets = true - continue - } - - objects[i].Offset = uint64(offset) - } - - if has8ByteOffsets { - for i := 0; i < len(objects); i++ { - offset, err := readUint64(r) - if err != nil { - return nil, err - } - - // TODO Not clear if all 8-byte offsets are populated, or only those that - // don't fit into 4 bytes. - if offset > 0 { - objects[i].Offset = offset - } - } - } - - idxPackID, err := readN(r, sumSize) - if err != nil { - return nil, err - } - - if s := hex.EncodeToString(idxPackID); s != packID { - return nil, fmt.Errorf("unexpected pack ID in idx: %s", s) - } - - if _, err := r.Peek(1); err != io.EOF { - if err == nil { - err = fmt.Errorf("unexpected trailing data, expected EOF") - } - return nil, err - } - - return objects, nil -} - -func readN(r io.Reader, n int) ([]byte, error) { - buf := make([]byte, n) - if _, err := io.ReadFull(r, buf); err != nil { - return nil, err - } - - return buf, nil -} - -func nPackObjects(packBase, packID string) (uint32, error) { - f, err := openPack(packBase, packID) - if err != nil { - return 0, err - } - defer f.Close() - - const headerLen = 12 - header, err := readN(f, headerLen) - if err != nil { - return 0, err - } - - const sig = "PACK\x00\x00\x00\x02" - if s := string(header[:len(sig)]); s != sig { - return 0, fmt.Errorf("unexpected pack signature %q", s) - } - header = header[len(sig):] - - return binary.BigEndian.Uint32(header), nil -} - -func openPack(packBase, packID string) (f *os.File, err error) { - packPath := packBase + ".pack" - f, err = os.Open(packPath) - if err != nil { - return nil, err - } - - defer func(f *os.File) { - if err != nil { - f.Close() - } - }(f) // Bind f early so that we can do "return nil, err". - - if _, err := f.Seek(-sumSize, io.SeekEnd); err != nil { - return nil, err - } - - sum, err := readN(f, sumSize) - if err != nil { - return nil, err - } - - if s := hex.EncodeToString(sum); s != packID { - return nil, fmt.Errorf("unexpected trailing checksum in .pack: %s", s) - } - - if _, err := f.Seek(0, io.SeekStart); err != nil { - return nil, err - } - - return f, nil -} - -func readUint32(r io.Reader) (uint32, error) { - buf, err := readN(r, 4) - if err != nil { - return 0, err - } - - return binary.BigEndian.Uint32(buf), nil -} - -func readUint64(r io.Reader) (uint64, error) { - buf, err := readN(r, 8) - if err != nil { - return 0, err - } - - return binary.BigEndian.Uint64(buf), nil -} |