Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
authorSami Hiltunen <shiltunen@gitlab.com>2023-06-27 17:57:52 +0300
committerSami Hiltunen <shiltunen@gitlab.com>2023-06-29 15:58:01 +0300
commit630f75f3461a783286df933da4853ec6d057e0d1 (patch)
treec42779c3666d2c2f70d22d506fb3237b4ee9907c
parent4cfc2985cb5dab908bcf326ed0e6d84d36915f60 (diff)
Implement tooling for importing a git repo to Neo4j (branch: smh-repo-graph-db)
This implements tooling to import a Git repository into Neo4j in order to prototype how a query language would work with Git data and how we'd model the data.
-rw-r--r--cmd/repo-graph/.gitignore1
-rw-r--r--cmd/repo-graph/README.md4
-rwxr-xr-xcmd/repo-graph/launch_db.sh2
-rw-r--r--cmd/repo-graph/load_csv.cypher68
-rw-r--r--cmd/repo-graph/main.go231
5 files changed, 306 insertions, 0 deletions
diff --git a/cmd/repo-graph/.gitignore b/cmd/repo-graph/.gitignore
new file mode 100644
index 000000000..afed0735d
--- /dev/null
+++ b/cmd/repo-graph/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/cmd/repo-graph/README.md b/cmd/repo-graph/README.md
new file mode 100644
index 000000000..530c8b6b5
--- /dev/null
+++ b/cmd/repo-graph/README.md
@@ -0,0 +1,4 @@
+1. `cd` into the `repo-graph` directory.
+1. Generate the *.csv files to import into the database by running `go run main.go <path-to-.git-dir>`
+1. Run `./launch_db.sh` to launch the database. Web console is available at `localhost:7474`.
+1. Run the query from `load_csv.cypher` in the web console to load the data.
diff --git a/cmd/repo-graph/launch_db.sh b/cmd/repo-graph/launch_db.sh
new file mode 100755
index 000000000..1036224bf
--- /dev/null
+++ b/cmd/repo-graph/launch_db.sh
@@ -0,0 +1,2 @@
+#! /bin/sh
+podman run --publish=7474:7474 --publish=7687:7687 --env=NEO4J_AUTH=none --env=NEO4J_db_import_csv_buffer__size=75000000 --volume=$(PWD):/import --rm --env=NEO4J_PLUGINS=\[\"apoc\"\] --env=NEO4J_server_memory_heap_max__size=4G neo4j
diff --git a/cmd/repo-graph/load_csv.cypher b/cmd/repo-graph/load_csv.cypher
new file mode 100644
index 000000000..1950ffe65
--- /dev/null
+++ b/cmd/repo-graph/load_csv.cypher
@@ -0,0 +1,68 @@
+:auto MATCH (n)
+CALL {
+ WITH n
+ DETACH DELETE n
+} IN TRANSACTIONS;
+
+CREATE CONSTRAINT IF NOT EXISTS FOR (o:Object) REQUIRE o.object_id IS UNIQUE;
+CREATE CONSTRAINT IF NOT EXISTS FOR (o:Blob) REQUIRE o.object_id IS UNIQUE;
+CREATE CONSTRAINT IF NOT EXISTS FOR (o:Tree) REQUIRE o.object_id IS UNIQUE;
+CREATE CONSTRAINT IF NOT EXISTS FOR (o:Commit) REQUIRE o.object_id IS UNIQUE;
+CREATE CONSTRAINT IF NOT EXISTS FOR (o:Tag) REQUIRE o.object_id IS UNIQUE;
+CREATE CONSTRAINT IF NOT EXISTS FOR (r:Reference) REQUIRE r.name IS UNIQUE;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///blob.csv" AS obj
+CALL {
+ WITH obj
+ CREATE (o:Object:Blob{object_id: obj.object_id, content: apoc.text.base64Decode(coalesce(obj.content, ''))})
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///tree.csv" AS obj
+CALL {
+ WITH obj
+ CREATE (o:Object:Tree{object_id: obj.object_id})
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///tag.csv" AS obj
+CALL {
+ WITH obj
+ CREATE (o:Object:Tag{object_id: obj.object_id, content: apoc.text.base64Decode(coalesce(obj.content, ''))})
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///commit.csv" AS obj
+CALL {
+ WITH obj
+ MATCH (t:Tree{object_id:obj.tree})
+ CREATE (o:Object:Commit{
+ object_id: obj.object_id,
+ subject: apoc.text.base64Decode(coalesce(obj.subject, '')),
+ message: apoc.text.base64Decode(coalesce(obj.message, '')),
+ author_name: apoc.text.base64Decode(coalesce(obj.author_name, '')),
+ author_email: apoc.text.base64Decode(coalesce(obj.author_email, '')),
+ author_date: datetime({epochSeconds: toInteger(obj.author_date_epoch), timezone: obj.author_date_tz}),
+ committer_name: apoc.text.base64Decode(coalesce(obj.committer_name, '')),
+ committer_email: apoc.text.base64Decode(coalesce(obj.committer_email, '')),
+ committer_date: datetime({epochSeconds: toInteger(obj.committer_date_epoch), timezone: obj.committer_date_tz})
+ })-[:TREE]->(t)
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///commit_parents.csv" AS obj
+CALL {
+ WITH obj
+ MATCH (c:Commit{object_id:obj.commit_oid}), (p:Commit{object_id:obj.parent_oid})
+ CREATE (c)-[:PARENT{ordinal:obj.ordinal}]->(p)
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///tree_entries.csv" AS obj
+CALL {
+ WITH obj
+ MATCH (t:Tree{object_id:obj.tree_oid}), (o:Object{object_id:obj.entry_oid})
+ CREATE (t)-[:ENTRY{path: apoc.text.base64Decode(obj.path), mode: obj.mode}]->(o)
+} IN TRANSACTIONS;
+
+:auto LOAD CSV WITH HEADERS FROM "file:///references.csv" AS ref
+CALL {
+ WITH ref
+ MATCH (o:Object{object_id:ref.object_id})
+ CREATE (r:Reference{name:ref.reference})-[:REFERENCES]->(o)
+} IN TRANSACTIONS;
diff --git a/cmd/repo-graph/main.go b/cmd/repo-graph/main.go
new file mode 100644
index 000000000..d593a51dd
--- /dev/null
+++ b/cmd/repo-graph/main.go
@@ -0,0 +1,231 @@
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "crypto/sha1"
+ "encoding/base64"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+
+ "gitlab.com/gitlab-org/gitaly/v16/internal/git"
+ "gitlab.com/gitlab-org/gitaly/v16/internal/git/catfile"
+ "gitlab.com/gitlab-org/gitaly/v16/internal/git/localrepo"
+ "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config"
+ "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb"
+)
+
+func main() {
+ if err := Main(); err != nil {
+ log.Fatalf("error: %q", err)
+ }
+}
+
+func Main() error {
+ cmd := exec.Command("git", "for-each-ref", "--format", "%(refname)%00%(objectname)")
+
+ refList, err := cmd.CombinedOutput()
+ if err != nil {
+ return fmt.Errorf("list refs: %w", err)
+ }
+
+ refs := map[string]string{}
+ for _, line := range strings.Split(string(refList), "\n") {
+ if line == "" {
+ break
+ }
+
+ components := strings.Split(line, "\x00")
+ refs[components[0]] = components[1]
+ }
+
+ cmd = exec.Command("git", "cat-file", "--batch-check=%(objectname) %(objecttype)", "--batch-all-objects")
+ objectList, err := cmd.CombinedOutput()
+ if err != nil {
+ return fmt.Errorf("list objects: %w", err)
+ }
+
+ objects := map[string]string{}
+ for _, line := range strings.Split(string(objectList), "\n") {
+ if line == "" {
+ break
+ }
+
+ components := strings.Split(line, " ")
+ objects[components[0]] = components[1]
+ }
+
+ refCSV, err := os.OpenFile("references.csv", os.O_TRUNC|os.O_CREATE|os.O_WRONLY, os.ModePerm)
+ if err != nil {
+ return fmt.Errorf("open reference.csv: %w", err)
+ }
+
+ fmt.Fprintf(refCSV, "reference,object_id\n")
+ for reference, objectID := range refs {
+ fmt.Fprintf(refCSV, "%s,%s\n", reference, objectID)
+ }
+
+ repoPath := os.Args[1]
+ cfg := config.Cfg{
+ Storages: []config.Storage{
+ {Name: "default", Path: filepath.Dir(repoPath)},
+ },
+ }
+
+ repoPb := &gitalypb.Repository{
+ StorageName: "default",
+ RelativePath: filepath.Base(repoPath),
+ }
+
+ cache := catfile.NewCache(cfg)
+
+ cmdFactory, cleanUp, err := git.NewExecCommandFactory(cfg, git.WithSkipHooks())
+ if err != nil {
+ return fmt.Errorf("new exec command factory: %w", err)
+ }
+ defer cleanUp()
+
+ repo := localrepo.New(config.NewLocator(cfg), cmdFactory, cache, repoPb)
+
+ ctx := context.Background()
+ objectReader, cleanReader, err := cache.ObjectReader(ctx, repo)
+ if err != nil {
+ return fmt.Errorf("object reader: %w", err)
+ }
+ defer cleanReader()
+
+ contentQueue, cleanQueue, err := objectReader.ObjectContentQueue(ctx)
+ if err != nil {
+ return fmt.Errorf("clean queue: %w", err)
+ }
+ defer cleanQueue()
+
+ blobCSV := openCSV("blob.csv", "object_id", "content")
+ defer blobCSV.Close()
+ tagCSV := openCSV("tag.csv", "object_id", "content")
+ defer tagCSV.Close()
+ commitCSV := openCSV("commit.csv", "object_id", "subject", "message", "tree", "author_name", "author_email", "author_date_epoch", "author_date_tz", "committer_name", "committer_email", "committer_date_epoch", "committer_date_tz")
+ defer commitCSV.Close()
+ treeCSV := openCSV("tree.csv", "object_id", "content")
+ defer treeCSV.Close()
+
+ commitParentsCSV := openCSV("commit_parents.csv", "commit_oid", "parent_oid", "ordinal")
+ defer commitParentsCSV.Close()
+
+ treeEntriesCSV := openCSV("tree_entries.csv", "tree_oid", "mode", "entry_oid", "path")
+ defer treeEntriesCSV.Close()
+
+ for objectID, objType := range objects {
+ if err := contentQueue.RequestObject(ctx, git.Revision(objectID)); err != nil {
+ panic(fmt.Errorf("request object: %w", err))
+ }
+ contentQueue.Flush(ctx)
+
+ object, err := contentQueue.ReadObject(ctx)
+ if err != nil {
+ return fmt.Errorf("read object: %w", err)
+ }
+
+ if object.Oid != git.ObjectID(objectID) {
+ return fmt.Errorf("unexpected id")
+ }
+
+ switch objType {
+ case "commit":
+ commit, err := catfile.NewParser().ParseCommit(object)
+ if err != nil {
+ return fmt.Errorf("parse commit: %w", err)
+ }
+
+ fmt.Fprintf(commitCSV, "%s,%s,%s,%s,%s,%s,%d,%s,%s,%s,%d,%s\n",
+ objectID,
+ base64.StdEncoding.EncodeToString(commit.Subject),
+ base64.StdEncoding.EncodeToString(commit.Body),
+ commit.TreeId,
+ base64.StdEncoding.EncodeToString(commit.Author.Name),
+ base64.StdEncoding.EncodeToString(commit.Author.Email),
+ commit.Author.Date.Seconds,
+ fmt.Sprintf("%s:%s", commit.Author.Timezone[:3], commit.Author.Timezone[3:]),
+ base64.StdEncoding.EncodeToString(commit.Committer.Name),
+ base64.StdEncoding.EncodeToString(commit.Committer.Email),
+ commit.Committer.Date.Seconds,
+ fmt.Sprintf("%s:%s", commit.Committer.Timezone[:3], commit.Committer.Timezone[3:]),
+ )
+
+ for i, parentOID := range commit.ParentIds {
+ fmt.Fprintf(commitParentsCSV, "%s,%s,%v\n",
+ objectID,
+ parentOID,
+ i,
+ )
+ }
+ case "tree":
+ content, err := ioutil.ReadAll(object)
+ if err != nil {
+ return fmt.Errorf("read tree: %w", err)
+ }
+
+ fmt.Fprintf(treeCSV, "%s,%s\n", objectID, base64.StdEncoding.EncodeToString(content))
+
+ reader := bufio.NewReader(bytes.NewReader(content))
+ for {
+ modeAndPath, err := reader.ReadString(byte('\x00'))
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+
+ return fmt.Errorf("read mode and path: %q", err)
+ }
+
+ components := strings.Split(modeAndPath, " ")
+ mode, path := components[0], strings.TrimRight(components[1], "\x00")
+
+ oidBytes := make([]byte, sha1.Size)
+ if _, err := reader.Read(oidBytes); err != nil {
+ return fmt.Errorf("read oid: %q", err)
+ }
+
+ hexOID := hex.EncodeToString(oidBytes)
+ fmt.Fprintf(treeEntriesCSV, "%s,%s,%s,%s\n", objectID, mode, hexOID, base64.StdEncoding.EncodeToString([]byte(path)))
+ }
+ case "tag":
+ content, err := ioutil.ReadAll(object)
+ if err != nil {
+ return fmt.Errorf("read all: %w", err)
+ }
+
+ fmt.Fprintf(tagCSV, "%s,%s\n", objectID, base64.StdEncoding.EncodeToString(content))
+ case "blob":
+ content, err := ioutil.ReadAll(object)
+ if err != nil {
+ return fmt.Errorf("read all: %w", err)
+ }
+
+ fmt.Fprintf(blobCSV, "%s,%s\n", objectID, base64.StdEncoding.EncodeToString(content))
+ }
+ }
+
+ return nil
+}
+
+func openCSV(name string, headers ...string) io.WriteCloser {
+ file, err := os.OpenFile(name, os.O_TRUNC|os.O_CREATE|os.O_WRONLY, os.ModePerm)
+ if err != nil {
+ panic(fmt.Errorf("open %q: %w", name, err))
+ }
+
+ if _, err := fmt.Fprintf(file, "%s\n", strings.Join(headers, ",")); err != nil {
+ panic(fmt.Errorf("write headers: %w", err))
+ }
+
+ return file
+}