Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Cai <jcai@gitlab.com>2019-08-15 00:25:56 +0300
committerJohn Cai <jcai@gitlab.com>2019-08-15 00:25:56 +0300
commite091fb7fe40d97c7eecba9d9dded027393261e7e (patch)
treed3aedefbfdb8fa6ad28629ed46aed38897ef373f
parentde79eca45626d14ff1283ec712bc6e040d00620c (diff)
parent6cd8869dd5536d1c468619f8e06783a1fc0450bc (diff)
Merge branch 'jv-parse-bitmap' into 'master'
Add packfile analyzer See merge request gitlab-org/gitaly!1256
-rw-r--r--cmd/gitaly-debug/bitmap.go102
-rw-r--r--cmd/gitaly-debug/main.go36
-rw-r--r--internal/git/packfile/bitmap.go287
-rw-r--r--internal/git/packfile/index.go265
-rw-r--r--internal/git/packfile/object.go42
5 files changed, 732 insertions, 0 deletions
diff --git a/cmd/gitaly-debug/bitmap.go b/cmd/gitaly-debug/bitmap.go
new file mode 100644
index 000000000..a8d768df1
--- /dev/null
+++ b/cmd/gitaly-debug/bitmap.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+
+ "gitlab.com/gitlab-org/gitaly/internal/git/packfile"
+)
+
+func listBitmapPack(idxFile string) {
+ idx, err := packfile.ReadIndex(idxFile)
+ noError(err)
+
+ noError(idx.LabelObjectTypes())
+
+ out := bufio.NewWriter(os.Stdout)
+ defer out.Flush()
+ fmt.Fprintf(out, "# pack-%s\n", idx.ID)
+
+ for _, o := range idx.PackfileOrder {
+ fmt.Fprintln(out, o)
+ }
+}
+
+func mapBitmapPack(idxFile string) {
+ idx, err := packfile.ReadIndex(idxFile)
+ noError(err)
+
+ noError(idx.LabelObjectTypes())
+
+ out := bufio.NewWriter(os.Stdout)
+ defer out.Flush()
+ fmt.Fprintf(out, "# pack-%s\n", idx.ID)
+ // Use a mix of lower and upper case that is easier to distinguish than all upper / all lower.
+ fmt.Fprintln(out, "# b: blob, C: commit, e: tree, T: tag")
+
+ for _, o := range idx.PackfileOrder {
+ c := ""
+ switch o.Type {
+ case packfile.TBlob:
+ c = "b"
+ case packfile.TCommit:
+ c = "C"
+ case packfile.TTree:
+ c = "e"
+ case packfile.TTag:
+ c = "T"
+ }
+
+ fmt.Fprint(out, c+" ")
+ }
+}
+
+func listBitmapCommits(idxFile string) {
+ idx, err := packfile.ReadIndex(idxFile)
+ noError(err)
+
+ noError(idx.LabelObjectTypes())
+
+ out := bufio.NewWriter(os.Stdout)
+ defer out.Flush()
+ fmt.Fprintf(out, "# pack-%s\n", idx.ID)
+
+ for i := 0; i < idx.IndexBitmap.NumBitmapCommits(); i++ {
+ bc, err := idx.IndexBitmap.BitmapCommit(i)
+ noError(err)
+ fmt.Fprintln(out, bc.OID)
+ }
+}
+
+func listBitmapReachable(idxFile string, commitID string) {
+ idx, err := packfile.ReadIndex(idxFile)
+ noError(err)
+
+ noError(idx.LabelObjectTypes())
+
+ var bc *packfile.BitmapCommit
+ for i := 0; i < idx.IndexBitmap.NumBitmapCommits(); i++ {
+ var err error
+ bc, err = idx.BitmapCommit(i)
+ noError(err)
+ if bc.OID == commitID {
+ break
+ }
+ }
+
+ if bc == nil || bc.OID != commitID {
+ fatal(fmt.Errorf("bitmap commit not found: %s", commitID))
+ }
+
+ out := bufio.NewWriter(os.Stdout)
+ defer out.Flush()
+ fmt.Fprintf(out, "# pack-%s\n", idx.ID)
+
+ fmt.Fprintf(out, "# bitmap commit %s\n", bc.OID)
+
+ noError(bc.Bitmap.Scan(func(i int) error {
+ _, err := fmt.Fprintln(out, idx.PackfileOrder[i])
+ return err
+ }))
+}
diff --git a/cmd/gitaly-debug/main.go b/cmd/gitaly-debug/main.go
index 96361caa6..2e6d6f49f 100644
--- a/cmd/gitaly-debug/main.go
+++ b/cmd/gitaly-debug/main.go
@@ -18,6 +18,22 @@ simulate-http-clone GIT_DIR
analyze-http-clone HTTP_URL
Clones a Git repository from a public HTTP URL into /dev/null.
+
+list-bitmap-pack IDX_FILE
+ List contents of packfile corresponding to IDX_FILE in offset
+ order. Only works if there also is a .bitmap file for this
+ pack.
+
+map-bitmap-pack IDX_FILE
+ Print map visualization of contents of packfile corresponding
+ to IDX_FILE in offset order. Only works if there also is a
+ .bitmap file for this pack.
+
+list-bitmap-commits IDX_FILE
+ List bitmap commits for .idx / .bitmap pair.
+
+list-bitmap-reachable IDX_FILE BITMAP_COMMIT_ID
+ List commits reachable from a bitmapped commit in a packfile.
`
)
@@ -38,6 +54,26 @@ func main() {
fatal(usage)
}
analyzeHTTPClone(extraArgs[0])
+ case "list-bitmap-pack":
+ if len(extraArgs) != 1 {
+ fatal(usage)
+ }
+ listBitmapPack(extraArgs[0])
+ case "map-bitmap-pack":
+ if len(extraArgs) != 1 {
+ fatal(usage)
+ }
+ mapBitmapPack(extraArgs[0])
+ case "list-bitmap-commits":
+ if len(extraArgs) != 1 {
+ fatal(usage)
+ }
+ listBitmapCommits(extraArgs[0])
+ case "list-bitmap-reachable":
+ if len(extraArgs) != 2 {
+ fatal(usage)
+ }
+ listBitmapReachable(extraArgs[0], extraArgs[1])
default:
fatal(usage)
}
diff --git a/internal/git/packfile/bitmap.go b/internal/git/packfile/bitmap.go
new file mode 100644
index 000000000..fa35ec4b6
--- /dev/null
+++ b/internal/git/packfile/bitmap.go
@@ -0,0 +1,287 @@
+package packfile
+
+import (
+ "bufio"
+ "bytes"
+ "encoding/binary"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "math"
+ "math/big"
+ "os"
+
+ "gitlab.com/gitlab-org/gitaly/internal/git/gitio"
+)
+
+// IndexBitmap is the in-memory representation of a .bitmap file.
+type IndexBitmap struct {
+ Commits *Bitmap
+ Trees *Bitmap
+ Blobs *Bitmap
+ Tags *Bitmap
+ bitmapCommits []*BitmapCommit
+ flags int
+}
+
+// BitmapCommit represents a bitmapped commit, i.e. a commit in the
+// packfile plus a bitmap indicating which objects are reachable from
+// that commit.
+type BitmapCommit struct {
+ OID string
+ *Bitmap
+ xorOffset byte
+ flags byte
+}
+
+// LoadBitmap opens the .bitmap file corresponding to idx and loads it
+// into memory. Returns an error if there is no .bitmap.
+func (idx *Index) LoadBitmap() error {
+ if idx.IndexBitmap != nil {
+ return nil
+ }
+
+ f, err := os.Open(idx.packBase + ".bitmap")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ r := bufio.NewReader(gitio.NewHashfileReader(f))
+
+ ib := &IndexBitmap{}
+ if err := ib.parseIndexBitmapHeader(r, idx); err != nil {
+ return err
+ }
+
+ for _, ptr := range []**Bitmap{&ib.Commits, &ib.Trees, &ib.Blobs, &ib.Tags} {
+ *ptr, err = ReadEWAH(r)
+ if err != nil {
+ return err
+ }
+
+ if err := (*ptr).Unpack(); err != nil {
+ return err
+ }
+ }
+
+ for i := range ib.bitmapCommits {
+ header, err := readN(r, 6)
+ if err != nil {
+ return err
+ }
+
+ bc := &BitmapCommit{
+ OID: idx.Objects[binary.BigEndian.Uint32(header[:4])].OID,
+ xorOffset: header[4],
+ flags: header[5],
+ }
+
+ if bc.Bitmap, err = ReadEWAH(r); err != nil {
+ return err
+ }
+
+ ib.bitmapCommits[i] = bc
+ }
+
+ if ib.flags&bitmapOptHashCache > 0 {
+ // Discard bitmap hash cache
+ for range idx.Objects {
+ if _, err := r.Discard(4); err != nil {
+ return err
+ }
+ }
+ }
+
+ if _, err := r.Peek(1); err != io.EOF {
+ return fmt.Errorf("expected EOF, got %v", err)
+ }
+
+ idx.IndexBitmap = ib
+ return nil
+}
+
+const (
+ bitmapOptFullDAG = 1 // BITMAP_OPT_FULL_DAG
+ bitmapOptHashCache = 4 // BITMAP_OPT_HASH_CACHE
+)
+
+func (ib *IndexBitmap) parseIndexBitmapHeader(r io.Reader, idx *Index) error {
+ const headerLen = 32
+ header, err := readN(r, headerLen)
+ if err != nil {
+ return err
+ }
+
+ const sig = "BITM\x00\x01"
+ if actualSig := string(header[:len(sig)]); actualSig != sig {
+ return fmt.Errorf("unexpected signature %q", actualSig)
+ }
+ header = header[len(sig):]
+
+ const flagLen = 2
+ ib.flags = int(binary.BigEndian.Uint16(header[:flagLen]))
+ header = header[flagLen:]
+
+ const knownFlags = bitmapOptFullDAG | bitmapOptHashCache
+ if ib.flags&^knownFlags != 0 || (ib.flags&bitmapOptFullDAG == 0) {
+ return fmt.Errorf("invalid flags %x", ib.flags)
+ }
+
+ const countLen = 4
+ count := binary.BigEndian.Uint32(header[:countLen])
+ header = header[countLen:]
+ ib.bitmapCommits = make([]*BitmapCommit, count)
+
+ if s := hex.EncodeToString(header); s != idx.ID {
+ return fmt.Errorf("unexpected pack ID in bitmap header: %s", s)
+ }
+
+ return nil
+}
+
+// NumBitmapCommits returns the number of indexed commits in the .bitmap file.
+func (ib *IndexBitmap) NumBitmapCommits() int { return len(ib.bitmapCommits) }
+
+// BitmapCommit retrieves a bitmap commit, along with its bitmap. If the
+// bitmap is XOR-compressed this will decompress it.
+func (ib *IndexBitmap) BitmapCommit(i int) (*BitmapCommit, error) {
+ if i >= ib.NumBitmapCommits() {
+ return nil, fmt.Errorf("bitmap commit index %d out of range", i)
+ }
+
+ // This is wasteful but correct: bitmap commit i may depend, via XOR, on
+ // a chain of preceding commits j_0,..., j_m < i. Instead of finding that
+ // chain we just build and XOR all commits up to and including i.
+ for j, bc := range ib.bitmapCommits[:i+1] {
+ if bc.Bitmap.bm != nil {
+ continue
+ }
+
+ if err := bc.Bitmap.Unpack(); err != nil {
+ return nil, err
+ }
+
+ if k := int(bc.xorOffset); k > 0 {
+ bm := bc.Bitmap.bm
+ bm.Xor(bm, ib.bitmapCommits[j-k].Bitmap.bm)
+ }
+ }
+
+ return ib.bitmapCommits[i], nil
+}
+
+// Bitmap represents a bitmap as used in a packfile .bitmap file.
+type Bitmap struct {
+ bits int
+ words int
+ raw []byte
+ bm *big.Int
+}
+
+// ReadEWAH parses an EWAH-compressed bitmap into a *Bitmap.
+func ReadEWAH(r io.Reader) (*Bitmap, error) {
+ header := make([]byte, 8)
+ if _, err := io.ReadFull(r, header); err != nil {
+ return nil, err
+ }
+
+ e := &Bitmap{}
+
+ uBits := binary.BigEndian.Uint32(header[:4])
+ if uBits > math.MaxInt32 {
+ return nil, fmt.Errorf("too many bits in bitmap: %d", uBits)
+ }
+ e.bits = int(uBits)
+
+ uWords := binary.BigEndian.Uint32(header[4:])
+ if uWords > math.MaxInt32 {
+ return nil, fmt.Errorf("too many words in bitmap: %d", uWords)
+ }
+ e.words = int(uWords)
+
+ const ewahTrailerLen = 4
+ rawSize := int64(e.words)*8 + ewahTrailerLen
+ if rawSize > math.MaxInt32 {
+ return nil, fmt.Errorf("bitmap does not fit in Go slice")
+ }
+
+ e.raw = make([]byte, int(rawSize))
+
+ if _, err := io.ReadFull(r, e.raw); err != nil {
+ return nil, err
+ }
+
+ return e, nil
+}
+
+// Unpack expands e.raw, which is EWAH-compressed, into an uncompressed *big.Int.
+func (e *Bitmap) Unpack() error {
+ if e.bm != nil {
+ return nil
+ }
+
+ const (
+ wordSize = 8
+ wordBits = 8 * wordSize
+ )
+
+ nUnpackedWords := e.bits / wordBits
+ if e.bits%wordBits > 0 {
+ nUnpackedWords++
+ }
+
+ buf := make([]byte, nUnpackedWords*wordSize)
+ bufPos := len(buf)
+
+ fillOnes := bytes.Repeat([]byte{0xff}, wordSize)
+
+ for i := 0; i < e.words; {
+ header := binary.BigEndian.Uint64(e.raw[wordSize*i : wordSize*(i+1)])
+ i++
+
+ cleanBit := int(header & 1)
+ nClean := uint32(header >> 1)
+ nDirty := uint32(header >> 33)
+
+ for ; nClean > 0; nClean-- {
+ // If cleanBit == 0 we don't have to do anything, because each byte in
+ // buf is initially zero.
+ if cleanBit == 1 {
+ copy(
+ buf[bufPos-wordSize:bufPos],
+ fillOnes,
+ )
+ }
+
+ bufPos -= wordSize
+ }
+
+ for ; nDirty > 0; nDirty-- {
+ copy(
+ buf[bufPos-wordSize:bufPos],
+ e.raw[wordSize*i:wordSize*(i+1)],
+ )
+ bufPos -= wordSize
+ i++
+ }
+ }
+
+ e.bm = big.NewInt(0)
+ e.bm.SetBytes(buf)
+
+ return nil
+}
+
+// Scan traverses the bitmap and calls f for each bit which is 1.
+func (e *Bitmap) Scan(f func(int) error) error {
+ for i := 0; i < e.bits; i++ {
+ if e.bm.Bit(i) == 1 {
+ if err := f(i); err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
diff --git a/internal/git/packfile/index.go b/internal/git/packfile/index.go
new file mode 100644
index 000000000..6597c8949
--- /dev/null
+++ b/internal/git/packfile/index.go
@@ -0,0 +1,265 @@
+package packfile
+
+import (
+ "bufio"
+ "context"
+ "crypto/sha1"
+ "encoding/binary"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "math"
+ "os"
+ "os/exec"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+
+ "gitlab.com/gitlab-org/gitaly/internal/command"
+)
+
+const sumSize = sha1.Size
+
+var (
+ idxFileRegex = regexp.MustCompile(`\A(.*/pack-)([0-9a-f]{40})\.idx\z`)
+)
+
+// Index is an in-memory representation of a packfile .idx file.
+type Index struct {
+ // ID is the packfile ID. For pack-123abc.idx, this would be 123abc.
+ ID string
+ packBase string
+ // Objects holds the list of objects in the packfile in index order, i.e. sorted by OID
+ Objects []*Object
+ // Objects holds the list of objects in the packfile in packfile order, i.e. sorted by packfile offset
+ PackfileOrder []*Object
+ *IndexBitmap
+}
+
+// ReadIndex opens a packfile .idx file and loads its contents into
+// memory. In doing so it will also open and read small amounts of data
+// from the .pack file itself.
+func ReadIndex(idxPath string) (*Index, error) {
+ reMatches := idxFileRegex.FindStringSubmatch(idxPath)
+ if len(reMatches) == 0 {
+ return nil, fmt.Errorf("invalid idx filename: %q", idxPath)
+ }
+
+ idx := &Index{
+ packBase: reMatches[1] + reMatches[2],
+ ID: reMatches[2],
+ }
+
+ f, err := os.Open(idx.packBase + ".idx")
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ if _, err := f.Seek(-2*sumSize, io.SeekEnd); err != nil {
+ return nil, err
+ }
+
+ packID, err := readN(f, sumSize)
+ if err != nil {
+ return nil, err
+ }
+
+ if actual := hex.EncodeToString(packID); idx.ID != actual {
+ return nil, fmt.Errorf("expected idx to go with pack %s, got %s", idx.ID, actual)
+ }
+
+ count, err := idx.numPackObjects()
+ if err != nil {
+ return nil, err
+ }
+
+ // TODO use a data structure other than a Go slice to hold the index
+ // entries? Go slices use int as their index type, and int may not be
+ // able to hold MaxUint32.
+ if count > math.MaxInt32 {
+ return nil, fmt.Errorf("too many objects in to fit in Go slice: %d", count)
+ }
+ idx.Objects = make([]*Object, count)
+
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ if _, err := f.Seek(0, io.SeekStart); err != nil {
+ return nil, err
+ }
+
+ showIndex, err := command.New(ctx, exec.Command("git", "show-index"), f, nil, nil)
+ if err != nil {
+ return nil, err
+ }
+
+ scanner := bufio.NewScanner(showIndex)
+ i := 0
+ for ; scanner.Scan(); i++ {
+ line := scanner.Text()
+ split := strings.SplitN(line, " ", 3)
+ if len(split) != 3 {
+ return nil, fmt.Errorf("unable to parse show-index line: %q", line)
+ }
+
+ offset, err := strconv.ParseUint(split[0], 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ oid := split[1]
+
+ idx.Objects[i] = &Object{OID: oid, Offset: offset}
+ }
+
+ if err := scanner.Err(); err != nil {
+ return nil, err
+ }
+
+ if err := showIndex.Wait(); err != nil {
+ return nil, err
+ }
+
+ if i != len(idx.Objects) {
+ return nil, fmt.Errorf("expected %d objects in output of git show-index, got %d", len(idx.Objects), i)
+ }
+
+ return idx, nil
+}
+
+func (idx *Index) numPackObjects() (uint32, error) {
+ f, err := idx.openPack()
+ if err != nil {
+ return 0, err
+ }
+ defer f.Close()
+
+ const sizeOffset = 8
+ if _, err := f.Seek(sizeOffset, io.SeekStart); err != nil {
+ return 0, err
+ }
+
+ return readUint32(f)
+}
+
+func (idx *Index) openPack() (f *os.File, err error) {
+ packPath := idx.packBase + ".pack"
+ f, err = os.Open(packPath)
+ if err != nil {
+ return nil, err
+ }
+
+ defer func(f *os.File) {
+ if err != nil {
+ f.Close()
+ }
+ }(f) // Bind f early so that we can do "return nil, err".
+
+ const headerLen = 8
+ header, err := readN(f, headerLen)
+ if err != nil {
+ return nil, err
+ }
+
+ const sig = "PACK\x00\x00\x00\x02"
+ if s := string(header); s != sig {
+ return nil, fmt.Errorf("unexpected pack signature %q", s)
+ }
+
+ if _, err := f.Seek(-sumSize, io.SeekEnd); err != nil {
+ return nil, err
+ }
+
+ sum, err := readN(f, sumSize)
+ if err != nil {
+ return nil, err
+ }
+
+ if s := hex.EncodeToString(sum); s != idx.ID {
+ return nil, fmt.Errorf("unexpected trailing checksum in .pack: %s", s)
+ }
+
+ if _, err := f.Seek(0, io.SeekStart); err != nil {
+ return nil, err
+ }
+
+ return f, nil
+}
+
+func readUint32(r io.Reader) (uint32, error) {
+ buf, err := readN(r, 4)
+ if err != nil {
+ return 0, err
+ }
+
+ return binary.BigEndian.Uint32(buf), nil
+}
+
+func readN(r io.Reader, n int) ([]byte, error) {
+ buf := make([]byte, n)
+ if _, err := io.ReadFull(r, buf); err != nil {
+ return nil, err
+ }
+
+ return buf, nil
+}
+
+// BuildPackfileOrder populates the PackfileOrder field.
+func (idx *Index) BuildPackfileOrder() {
+ if len(idx.PackfileOrder) > 0 {
+ return
+ }
+
+ idx.PackfileOrder = make([]*Object, len(idx.Objects))
+ copy(idx.PackfileOrder, idx.Objects)
+ sort.Sort(offsetOrder(idx.PackfileOrder))
+}
+
+type offsetOrder []*Object
+
+func (oo offsetOrder) Len() int { return len(oo) }
+func (oo offsetOrder) Less(i, j int) bool { return oo[i].Offset < oo[j].Offset }
+func (oo offsetOrder) Swap(i, j int) { oo[i], oo[j] = oo[j], oo[i] }
+
+// LabelObjectTypes tries to label each object in the index with its
+// object type, using the packfile bitmap. Returns an error if there is
+// no packfile .bitmap file.
+func (idx *Index) LabelObjectTypes() error {
+ if err := idx.LoadBitmap(); err != nil {
+ return err
+ }
+
+ idx.BuildPackfileOrder()
+
+ for _, t := range []struct {
+ objectType ObjectType
+ bmp *Bitmap
+ }{
+ {TCommit, idx.IndexBitmap.Commits},
+ {TTree, idx.IndexBitmap.Trees},
+ {TBlob, idx.IndexBitmap.Blobs},
+ {TTag, idx.IndexBitmap.Tags},
+ } {
+ if err := t.bmp.Scan(func(i int) error {
+ obj := idx.PackfileOrder[i]
+ if obj.Type != TUnknown {
+ return fmt.Errorf("type already set for object %v", obj)
+ }
+
+ obj.Type = t.objectType
+
+ return nil
+ }); err != nil {
+ return err
+ }
+ }
+
+ for _, obj := range idx.PackfileOrder {
+ if obj.Type == TUnknown {
+ return fmt.Errorf("object missing type label: %v", obj)
+ }
+ }
+
+ return nil
+}
diff --git a/internal/git/packfile/object.go b/internal/git/packfile/object.go
new file mode 100644
index 000000000..b9598bdb6
--- /dev/null
+++ b/internal/git/packfile/object.go
@@ -0,0 +1,42 @@
+package packfile
+
+import "fmt"
+
+// ObjectType is used to label index entries as commits, trees, blobs or tags.
+type ObjectType byte
+
+const (
+ // TUnknown is a sentinel indicating an object has not been labeled yet
+ TUnknown ObjectType = iota
+ // TBlob means Git blob
+ TBlob
+ // TCommit means Git commit
+ TCommit
+ // TTree means Git tree
+ TTree
+ // TTag means Git tag
+ TTag
+)
+
+// Object represents a Git packfile index entry, optionally decorated with its object type.
+type Object struct {
+ OID string
+ Type ObjectType
+ Offset uint64
+}
+
+func (o Object) String() string {
+ t := "unknown"
+ switch o.Type {
+ case TBlob:
+ t = "blob"
+ case TCommit:
+ t = "commit"
+ case TTree:
+ t = "tree"
+ case TTag:
+ t = "tag"
+ }
+
+ return fmt.Sprintf("%s %s\t%d", o.OID, t, o.Offset)
+}