diff options
Diffstat (limited to 'internal/git')
-rw-r--r-- | internal/git/packfile/bitmap.go | 287 | ||||
-rw-r--r-- | internal/git/packfile/index.go | 265 | ||||
-rw-r--r-- | internal/git/packfile/object.go | 42 |
3 files changed, 594 insertions, 0 deletions
diff --git a/internal/git/packfile/bitmap.go b/internal/git/packfile/bitmap.go new file mode 100644 index 000000000..fa35ec4b6 --- /dev/null +++ b/internal/git/packfile/bitmap.go @@ -0,0 +1,287 @@ +package packfile + +import ( + "bufio" + "bytes" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "math" + "math/big" + "os" + + "gitlab.com/gitlab-org/gitaly/internal/git/gitio" +) + +// IndexBitmap is the in-memory representation of a .bitmap file. +type IndexBitmap struct { + Commits *Bitmap + Trees *Bitmap + Blobs *Bitmap + Tags *Bitmap + bitmapCommits []*BitmapCommit + flags int +} + +// BitmapCommit represents a bitmapped commit, i.e. a commit in the +// packfile plus a bitmap indicating which objects are reachable from +// that commit. +type BitmapCommit struct { + OID string + *Bitmap + xorOffset byte + flags byte +} + +// LoadBitmap opens the .bitmap file corresponding to idx and loads it +// into memory. Returns an error if there is no .bitmap. +func (idx *Index) LoadBitmap() error { + if idx.IndexBitmap != nil { + return nil + } + + f, err := os.Open(idx.packBase + ".bitmap") + if err != nil { + return err + } + defer f.Close() + + r := bufio.NewReader(gitio.NewHashfileReader(f)) + + ib := &IndexBitmap{} + if err := ib.parseIndexBitmapHeader(r, idx); err != nil { + return err + } + + for _, ptr := range []**Bitmap{&ib.Commits, &ib.Trees, &ib.Blobs, &ib.Tags} { + *ptr, err = ReadEWAH(r) + if err != nil { + return err + } + + if err := (*ptr).Unpack(); err != nil { + return err + } + } + + for i := range ib.bitmapCommits { + header, err := readN(r, 6) + if err != nil { + return err + } + + bc := &BitmapCommit{ + OID: idx.Objects[binary.BigEndian.Uint32(header[:4])].OID, + xorOffset: header[4], + flags: header[5], + } + + if bc.Bitmap, err = ReadEWAH(r); err != nil { + return err + } + + ib.bitmapCommits[i] = bc + } + + if ib.flags&bitmapOptHashCache > 0 { + // Discard bitmap hash cache + for range idx.Objects { + if _, err := r.Discard(4); err != nil { + return err + } + } + } + + if _, err := r.Peek(1); err != io.EOF { + return fmt.Errorf("expected EOF, got %v", err) + } + + idx.IndexBitmap = ib + return nil +} + +const ( + bitmapOptFullDAG = 1 // BITMAP_OPT_FULL_DAG + bitmapOptHashCache = 4 // BITMAP_OPT_HASH_CACHE +) + +func (ib *IndexBitmap) parseIndexBitmapHeader(r io.Reader, idx *Index) error { + const headerLen = 32 + header, err := readN(r, headerLen) + if err != nil { + return err + } + + const sig = "BITM\x00\x01" + if actualSig := string(header[:len(sig)]); actualSig != sig { + return fmt.Errorf("unexpected signature %q", actualSig) + } + header = header[len(sig):] + + const flagLen = 2 + ib.flags = int(binary.BigEndian.Uint16(header[:flagLen])) + header = header[flagLen:] + + const knownFlags = bitmapOptFullDAG | bitmapOptHashCache + if ib.flags&^knownFlags != 0 || (ib.flags&bitmapOptFullDAG == 0) { + return fmt.Errorf("invalid flags %x", ib.flags) + } + + const countLen = 4 + count := binary.BigEndian.Uint32(header[:countLen]) + header = header[countLen:] + ib.bitmapCommits = make([]*BitmapCommit, count) + + if s := hex.EncodeToString(header); s != idx.ID { + return fmt.Errorf("unexpected pack ID in bitmap header: %s", s) + } + + return nil +} + +// NumBitmapCommits returns the number of indexed commits in the .bitmap file. +func (ib *IndexBitmap) NumBitmapCommits() int { return len(ib.bitmapCommits) } + +// BitmapCommit retrieves a bitmap commit, along with its bitmap. If the +// bitmap is XOR-compressed this will decompress it. +func (ib *IndexBitmap) BitmapCommit(i int) (*BitmapCommit, error) { + if i >= ib.NumBitmapCommits() { + return nil, fmt.Errorf("bitmap commit index %d out of range", i) + } + + // This is wasteful but correct: bitmap commit i may depend, via XOR, on + // a chain of preceding commits j_0,..., j_m < i. Instead of finding that + // chain we just build and XOR all commits up to and including i. + for j, bc := range ib.bitmapCommits[:i+1] { + if bc.Bitmap.bm != nil { + continue + } + + if err := bc.Bitmap.Unpack(); err != nil { + return nil, err + } + + if k := int(bc.xorOffset); k > 0 { + bm := bc.Bitmap.bm + bm.Xor(bm, ib.bitmapCommits[j-k].Bitmap.bm) + } + } + + return ib.bitmapCommits[i], nil +} + +// Bitmap represents a bitmap as used in a packfile .bitmap file. +type Bitmap struct { + bits int + words int + raw []byte + bm *big.Int +} + +// ReadEWAH parses an EWAH-compressed bitmap into a *Bitmap. +func ReadEWAH(r io.Reader) (*Bitmap, error) { + header := make([]byte, 8) + if _, err := io.ReadFull(r, header); err != nil { + return nil, err + } + + e := &Bitmap{} + + uBits := binary.BigEndian.Uint32(header[:4]) + if uBits > math.MaxInt32 { + return nil, fmt.Errorf("too many bits in bitmap: %d", uBits) + } + e.bits = int(uBits) + + uWords := binary.BigEndian.Uint32(header[4:]) + if uWords > math.MaxInt32 { + return nil, fmt.Errorf("too many words in bitmap: %d", uWords) + } + e.words = int(uWords) + + const ewahTrailerLen = 4 + rawSize := int64(e.words)*8 + ewahTrailerLen + if rawSize > math.MaxInt32 { + return nil, fmt.Errorf("bitmap does not fit in Go slice") + } + + e.raw = make([]byte, int(rawSize)) + + if _, err := io.ReadFull(r, e.raw); err != nil { + return nil, err + } + + return e, nil +} + +// Unpack expands e.raw, which is EWAH-compressed, into an uncompressed *big.Int. +func (e *Bitmap) Unpack() error { + if e.bm != nil { + return nil + } + + const ( + wordSize = 8 + wordBits = 8 * wordSize + ) + + nUnpackedWords := e.bits / wordBits + if e.bits%wordBits > 0 { + nUnpackedWords++ + } + + buf := make([]byte, nUnpackedWords*wordSize) + bufPos := len(buf) + + fillOnes := bytes.Repeat([]byte{0xff}, wordSize) + + for i := 0; i < e.words; { + header := binary.BigEndian.Uint64(e.raw[wordSize*i : wordSize*(i+1)]) + i++ + + cleanBit := int(header & 1) + nClean := uint32(header >> 1) + nDirty := uint32(header >> 33) + + for ; nClean > 0; nClean-- { + // If cleanBit == 0 we don't have to do anything, because each byte in + // buf is initially zero. + if cleanBit == 1 { + copy( + buf[bufPos-wordSize:bufPos], + fillOnes, + ) + } + + bufPos -= wordSize + } + + for ; nDirty > 0; nDirty-- { + copy( + buf[bufPos-wordSize:bufPos], + e.raw[wordSize*i:wordSize*(i+1)], + ) + bufPos -= wordSize + i++ + } + } + + e.bm = big.NewInt(0) + e.bm.SetBytes(buf) + + return nil +} + +// Scan traverses the bitmap and calls f for each bit which is 1. +func (e *Bitmap) Scan(f func(int) error) error { + for i := 0; i < e.bits; i++ { + if e.bm.Bit(i) == 1 { + if err := f(i); err != nil { + return err + } + } + } + + return nil +} diff --git a/internal/git/packfile/index.go b/internal/git/packfile/index.go new file mode 100644 index 000000000..6597c8949 --- /dev/null +++ b/internal/git/packfile/index.go @@ -0,0 +1,265 @@ +package packfile + +import ( + "bufio" + "context" + "crypto/sha1" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "math" + "os" + "os/exec" + "regexp" + "sort" + "strconv" + "strings" + + "gitlab.com/gitlab-org/gitaly/internal/command" +) + +const sumSize = sha1.Size + +var ( + idxFileRegex = regexp.MustCompile(`\A(.*/pack-)([0-9a-f]{40})\.idx\z`) +) + +// Index is an in-memory representation of a packfile .idx file. +type Index struct { + // ID is the packfile ID. For pack-123abc.idx, this would be 123abc. + ID string + packBase string + // Objects holds the list of objects in the packfile in index order, i.e. sorted by OID + Objects []*Object + // Objects holds the list of objects in the packfile in packfile order, i.e. sorted by packfile offset + PackfileOrder []*Object + *IndexBitmap +} + +// ReadIndex opens a packfile .idx file and loads its contents into +// memory. In doing so it will also open and read small amounts of data +// from the .pack file itself. +func ReadIndex(idxPath string) (*Index, error) { + reMatches := idxFileRegex.FindStringSubmatch(idxPath) + if len(reMatches) == 0 { + return nil, fmt.Errorf("invalid idx filename: %q", idxPath) + } + + idx := &Index{ + packBase: reMatches[1] + reMatches[2], + ID: reMatches[2], + } + + f, err := os.Open(idx.packBase + ".idx") + if err != nil { + return nil, err + } + defer f.Close() + + if _, err := f.Seek(-2*sumSize, io.SeekEnd); err != nil { + return nil, err + } + + packID, err := readN(f, sumSize) + if err != nil { + return nil, err + } + + if actual := hex.EncodeToString(packID); idx.ID != actual { + return nil, fmt.Errorf("expected idx to go with pack %s, got %s", idx.ID, actual) + } + + count, err := idx.numPackObjects() + if err != nil { + return nil, err + } + + // TODO use a data structure other than a Go slice to hold the index + // entries? Go slices use int as their index type, and int may not be + // able to hold MaxUint32. + if count > math.MaxInt32 { + return nil, fmt.Errorf("too many objects in to fit in Go slice: %d", count) + } + idx.Objects = make([]*Object, count) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, err + } + + showIndex, err := command.New(ctx, exec.Command("git", "show-index"), f, nil, nil) + if err != nil { + return nil, err + } + + scanner := bufio.NewScanner(showIndex) + i := 0 + for ; scanner.Scan(); i++ { + line := scanner.Text() + split := strings.SplitN(line, " ", 3) + if len(split) != 3 { + return nil, fmt.Errorf("unable to parse show-index line: %q", line) + } + + offset, err := strconv.ParseUint(split[0], 10, 64) + if err != nil { + return nil, err + } + oid := split[1] + + idx.Objects[i] = &Object{OID: oid, Offset: offset} + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + if err := showIndex.Wait(); err != nil { + return nil, err + } + + if i != len(idx.Objects) { + return nil, fmt.Errorf("expected %d objects in output of git show-index, got %d", len(idx.Objects), i) + } + + return idx, nil +} + +func (idx *Index) numPackObjects() (uint32, error) { + f, err := idx.openPack() + if err != nil { + return 0, err + } + defer f.Close() + + const sizeOffset = 8 + if _, err := f.Seek(sizeOffset, io.SeekStart); err != nil { + return 0, err + } + + return readUint32(f) +} + +func (idx *Index) openPack() (f *os.File, err error) { + packPath := idx.packBase + ".pack" + f, err = os.Open(packPath) + if err != nil { + return nil, err + } + + defer func(f *os.File) { + if err != nil { + f.Close() + } + }(f) // Bind f early so that we can do "return nil, err". + + const headerLen = 8 + header, err := readN(f, headerLen) + if err != nil { + return nil, err + } + + const sig = "PACK\x00\x00\x00\x02" + if s := string(header); s != sig { + return nil, fmt.Errorf("unexpected pack signature %q", s) + } + + if _, err := f.Seek(-sumSize, io.SeekEnd); err != nil { + return nil, err + } + + sum, err := readN(f, sumSize) + if err != nil { + return nil, err + } + + if s := hex.EncodeToString(sum); s != idx.ID { + return nil, fmt.Errorf("unexpected trailing checksum in .pack: %s", s) + } + + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, err + } + + return f, nil +} + +func readUint32(r io.Reader) (uint32, error) { + buf, err := readN(r, 4) + if err != nil { + return 0, err + } + + return binary.BigEndian.Uint32(buf), nil +} + +func readN(r io.Reader, n int) ([]byte, error) { + buf := make([]byte, n) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, err + } + + return buf, nil +} + +// BuildPackfileOrder populates the PackfileOrder field. +func (idx *Index) BuildPackfileOrder() { + if len(idx.PackfileOrder) > 0 { + return + } + + idx.PackfileOrder = make([]*Object, len(idx.Objects)) + copy(idx.PackfileOrder, idx.Objects) + sort.Sort(offsetOrder(idx.PackfileOrder)) +} + +type offsetOrder []*Object + +func (oo offsetOrder) Len() int { return len(oo) } +func (oo offsetOrder) Less(i, j int) bool { return oo[i].Offset < oo[j].Offset } +func (oo offsetOrder) Swap(i, j int) { oo[i], oo[j] = oo[j], oo[i] } + +// LabelObjectTypes tries to label each object in the index with its +// object type, using the packfile bitmap. Returns an error if there is +// no packfile .bitmap file. +func (idx *Index) LabelObjectTypes() error { + if err := idx.LoadBitmap(); err != nil { + return err + } + + idx.BuildPackfileOrder() + + for _, t := range []struct { + objectType ObjectType + bmp *Bitmap + }{ + {TCommit, idx.IndexBitmap.Commits}, + {TTree, idx.IndexBitmap.Trees}, + {TBlob, idx.IndexBitmap.Blobs}, + {TTag, idx.IndexBitmap.Tags}, + } { + if err := t.bmp.Scan(func(i int) error { + obj := idx.PackfileOrder[i] + if obj.Type != TUnknown { + return fmt.Errorf("type already set for object %v", obj) + } + + obj.Type = t.objectType + + return nil + }); err != nil { + return err + } + } + + for _, obj := range idx.PackfileOrder { + if obj.Type == TUnknown { + return fmt.Errorf("object missing type label: %v", obj) + } + } + + return nil +} diff --git a/internal/git/packfile/object.go b/internal/git/packfile/object.go new file mode 100644 index 000000000..b9598bdb6 --- /dev/null +++ b/internal/git/packfile/object.go @@ -0,0 +1,42 @@ +package packfile + +import "fmt" + +// ObjectType is used to label index entries as commits, trees, blobs or tags. +type ObjectType byte + +const ( + // TUnknown is a sentinel indicating an object has not been labeled yet + TUnknown ObjectType = iota + // TBlob means Git blob + TBlob + // TCommit means Git commit + TCommit + // TTree means Git tree + TTree + // TTag means Git tag + TTag +) + +// Object represents a Git packfile index entry, optionally decorated with its object type. +type Object struct { + OID string + Type ObjectType + Offset uint64 +} + +func (o Object) String() string { + t := "unknown" + switch o.Type { + case TBlob: + t = "blob" + case TCommit: + t = "commit" + case TTree: + t = "tree" + case TTag: + t = "tag" + } + + return fmt.Sprintf("%s %s\t%d", o.OID, t, o.Offset) +} |