diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2023-06-27 17:57:52 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2023-06-29 15:58:01 +0300 |
commit | 630f75f3461a783286df933da4853ec6d057e0d1 (patch) | |
tree | c42779c3666d2c2f70d22d506fb3237b4ee9907c | |
parent | 4cfc2985cb5dab908bcf326ed0e6d84d36915f60 (diff) |
Implement tooling for importing a git repo to Neo4J (branch: smh-repo-graph-db)
This implements tooling to import a Git repository into Neo4J in
order to prototype how a query language would work with Git data and
how we'd model the data.
-rw-r--r-- | cmd/repo-graph/.gitignore | 1 | ||||
-rw-r--r-- | cmd/repo-graph/README.md | 4 | ||||
-rwxr-xr-x | cmd/repo-graph/launch_db.sh | 2 | ||||
-rw-r--r-- | cmd/repo-graph/load_csv.cypher | 68 | ||||
-rw-r--r-- | cmd/repo-graph/main.go | 231 |
5 files changed, 306 insertions, 0 deletions
diff --git a/cmd/repo-graph/.gitignore b/cmd/repo-graph/.gitignore new file mode 100644 index 000000000..afed0735d --- /dev/null +++ b/cmd/repo-graph/.gitignore @@ -0,0 +1 @@ +*.csv diff --git a/cmd/repo-graph/README.md b/cmd/repo-graph/README.md new file mode 100644 index 000000000..530c8b6b5 --- /dev/null +++ b/cmd/repo-graph/README.md @@ -0,0 +1,4 @@ +1. `cd` into the `repo-graph` directory. +1. Generate the *.csv files to import into the database by running `go run main.go <path-to-.git-dir>` +1. Run `./launch_db.sh` to launch the database. Web console is available at `localhost:7474`. +1. Run the query from `load_csv.cypher` in the web console to load the data. diff --git a/cmd/repo-graph/launch_db.sh b/cmd/repo-graph/launch_db.sh new file mode 100755 index 000000000..1036224bf --- /dev/null +++ b/cmd/repo-graph/launch_db.sh @@ -0,0 +1,2 @@ +#! /bin/sh +podman run --publish=7474:7474 --publish=7687:7687 --env=NEO4J_AUTH=none --env=NEO4J_db_import_csv_buffer__size=75000000 --volume=$(PWD):/import --rm --env=NEO4J_PLUGINS=\[\"apoc\"\] --env=NEO4J_server_memory_heap_max__size=4G neo4j diff --git a/cmd/repo-graph/load_csv.cypher b/cmd/repo-graph/load_csv.cypher new file mode 100644 index 000000000..1950ffe65 --- /dev/null +++ b/cmd/repo-graph/load_csv.cypher @@ -0,0 +1,68 @@ +:auto MATCH (n) +CALL { + WITH n + DETACH DELETE n +} IN TRANSACTIONS; + +CREATE CONSTRAINT IF NOT EXISTS FOR (o:Object) REQUIRE o.object_id IS UNIQUE; +CREATE CONSTRAINT IF NOT EXISTS FOR (o:Blob) REQUIRE o.object_id IS UNIQUE; +CREATE CONSTRAINT IF NOT EXISTS FOR (o:Tree) REQUIRE o.object_id IS UNIQUE; +CREATE CONSTRAINT IF NOT EXISTS FOR (o:Commit) REQUIRE o.object_id IS UNIQUE; +CREATE CONSTRAINT IF NOT EXISTS FOR (o:Tag) REQUIRE o.object_id IS UNIQUE; +CREATE CONSTRAINT IF NOT EXISTS FOR (r:Reference) REQUIRE r.name IS UNIQUE; + +:auto LOAD CSV WITH HEADERS FROM "file:///blob.csv" AS obj +CALL { + WITH obj + CREATE (o:Object:Blob{object_id: obj.object_id, content: 
apoc.text.base64Decode(coalesce(obj.content, ''))}) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///tree.csv" AS obj +CALL { + WITH obj + CREATE (o:Object:Tree{object_id: obj.object_id}) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///tag.csv" AS obj +CALL { + WITH obj + CREATE (o:Object:Tag{object_id: obj.object_id, content: apoc.text.base64Decode(coalesce(obj.content, ''))}) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///commit.csv" AS obj +CALL { + WITH obj + MATCH (t:Tree{object_id:obj.tree}) + CREATE (o:Object:Commit{ + object_id: obj.object_id, + subject: apoc.text.base64Decode(coalesce(obj.subject, '')), + message: apoc.text.base64Decode(coalesce(obj.message, '')), + author_name: apoc.text.base64Decode(coalesce(obj.author_name, '')), + author_email: apoc.text.base64Decode(coalesce(obj.author_email, '')), + author_date: datetime({epochSeconds: toInteger(obj.author_date_epoch), timezone: obj.author_date_tz}), + committer_name: apoc.text.base64Decode(coalesce(obj.committer_name, '')), + committer_email: apoc.text.base64Decode(coalesce(obj.committer_email, '')), + committer_date: datetime({epochSeconds: toInteger(obj.committer_date_epoch), timezone: obj.committer_date_tz}) + })-[:TREE]->(t) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///commit_parents.csv" AS obj +CALL { + WITH obj + MATCH (c:Commit{object_id:obj.commit_oid}), (p:Commit{object_id:obj.parent_oid}) + CREATE (c)-[:PARENT{ordinal:obj.ordinal}]->(p) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///tree_entries.csv" AS obj +CALL { + WITH obj + MATCH (t:Tree{object_id:obj.tree_oid}), (o:Object{object_id:obj.entry_oid}) + CREATE (t)-[:ENTRY{path: apoc.text.base64Decode(obj.path), mode: obj.mode}]->(o) +} IN TRANSACTIONS; + +:auto LOAD CSV WITH HEADERS FROM "file:///references.csv" AS ref +CALL { + WITH ref + MATCH (o:Object{object_id:ref.object_id}) + CREATE (r:Reference{name:ref.reference})-[:REFERENCES]->(o) +} IN 
TRANSACTIONS; diff --git a/cmd/repo-graph/main.go b/cmd/repo-graph/main.go new file mode 100644 index 000000000..d593a51dd --- /dev/null +++ b/cmd/repo-graph/main.go @@ -0,0 +1,231 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "crypto/sha1" + "encoding/base64" + "encoding/hex" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + + "gitlab.com/gitlab-org/gitaly/v16/internal/git" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/catfile" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/localrepo" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" +) + +func main() { + if err := Main(); err != nil { + log.Fatalf("error: %q", err) + } +} + +func Main() error { + cmd := exec.Command("git", "for-each-ref", "--format", "%(refname)%00%(objectname)") + + refList, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("list refs: %w", err) + } + + refs := map[string]string{} + for _, line := range strings.Split(string(refList), "\n") { + if line == "" { + break + } + + components := strings.Split(line, "\x00") + refs[components[0]] = components[1] + } + + cmd = exec.Command("git", "cat-file", "--batch-check=%(objectname) %(objecttype)", "--batch-all-objects") + objectList, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("list objects: %w", err) + } + + objects := map[string]string{} + for _, line := range strings.Split(string(objectList), "\n") { + if line == "" { + break + } + + components := strings.Split(line, " ") + objects[components[0]] = components[1] + } + + refCSV, err := os.OpenFile("references.csv", os.O_TRUNC|os.O_CREATE|os.O_WRONLY, os.ModePerm) + if err != nil { + return fmt.Errorf("open reference.csv: %w", err) + } + + fmt.Fprintf(refCSV, "reference,object_id\n") + for reference, objectID := range refs { + fmt.Fprintf(refCSV, "%s,%s\n", reference, objectID) + } + + repoPath := os.Args[1] + cfg := config.Cfg{ + 
Storages: []config.Storage{ + {Name: "default", Path: filepath.Dir(repoPath)}, + }, + } + + repoPb := &gitalypb.Repository{ + StorageName: "default", + RelativePath: filepath.Base(repoPath), + } + + cache := catfile.NewCache(cfg) + + cmdFactory, cleanUp, err := git.NewExecCommandFactory(cfg, git.WithSkipHooks()) + if err != nil { + return fmt.Errorf("new exec command factory: %w", err) + } + defer cleanUp() + + repo := localrepo.New(config.NewLocator(cfg), cmdFactory, cache, repoPb) + + ctx := context.Background() + objectReader, cleanReader, err := cache.ObjectReader(ctx, repo) + if err != nil { + return fmt.Errorf("object reader: %w", err) + } + defer cleanReader() + + contentQueue, cleanQueue, err := objectReader.ObjectContentQueue(ctx) + if err != nil { + return fmt.Errorf("clean queue: %w", err) + } + defer cleanQueue() + + blobCSV := openCSV("blob.csv", "object_id", "content") + defer blobCSV.Close() + tagCSV := openCSV("tag.csv", "object_id", "content") + defer tagCSV.Close() + commitCSV := openCSV("commit.csv", "object_id", "subject", "message", "tree", "author_name", "author_email", "author_date_epoch", "author_date_tz", "committer_name", "committer_email", "committer_date_epoch", "committer_date_tz") + defer commitCSV.Close() + treeCSV := openCSV("tree.csv", "object_id", "content") + defer treeCSV.Close() + + commitParentsCSV := openCSV("commit_parents.csv", "commit_oid", "parent_oid", "ordinal") + defer commitParentsCSV.Close() + + treeEntriesCSV := openCSV("tree_entries.csv", "tree_oid", "mode", "entry_oid", "path") + defer treeEntriesCSV.Close() + + for objectID, objType := range objects { + if err := contentQueue.RequestObject(ctx, git.Revision(objectID)); err != nil { + panic(fmt.Errorf("request object: %w", err)) + } + contentQueue.Flush(ctx) + + object, err := contentQueue.ReadObject(ctx) + if err != nil { + return fmt.Errorf("read object: %w", err) + } + + if object.Oid != git.ObjectID(objectID) { + return fmt.Errorf("unexpected id") + } + + 
switch objType { + case "commit": + commit, err := catfile.NewParser().ParseCommit(object) + if err != nil { + return fmt.Errorf("parse commit: %w", err) + } + + fmt.Fprintf(commitCSV, "%s,%s,%s,%s,%s,%s,%d,%s,%s,%s,%d,%s\n", + objectID, + base64.StdEncoding.EncodeToString(commit.Subject), + base64.StdEncoding.EncodeToString(commit.Body), + commit.TreeId, + base64.StdEncoding.EncodeToString(commit.Author.Name), + base64.StdEncoding.EncodeToString(commit.Author.Email), + commit.Author.Date.Seconds, + fmt.Sprintf("%s:%s", commit.Author.Timezone[:3], commit.Author.Timezone[3:]), + base64.StdEncoding.EncodeToString(commit.Committer.Name), + base64.StdEncoding.EncodeToString(commit.Committer.Email), + commit.Committer.Date.Seconds, + fmt.Sprintf("%s:%s", commit.Committer.Timezone[:3], commit.Committer.Timezone[3:]), + ) + + for i, parentOID := range commit.ParentIds { + fmt.Fprintf(commitParentsCSV, "%s,%s,%v\n", + objectID, + parentOID, + i, + ) + } + case "tree": + content, err := ioutil.ReadAll(object) + if err != nil { + return fmt.Errorf("read tree: %w", err) + } + + fmt.Fprintf(treeCSV, "%s,%s\n", objectID, base64.StdEncoding.EncodeToString(content)) + + reader := bufio.NewReader(bytes.NewReader(content)) + for { + modeAndPath, err := reader.ReadString(byte('\x00')) + if err != nil { + if err == io.EOF { + break + } + + return fmt.Errorf("read mode and path: %q", err) + } + + components := strings.Split(modeAndPath, " ") + mode, path := components[0], strings.TrimRight(components[1], "\x00") + + oidBytes := make([]byte, sha1.Size) + if _, err := reader.Read(oidBytes); err != nil { + return fmt.Errorf("read oid: %q", err) + } + + hexOID := hex.EncodeToString(oidBytes) + fmt.Fprintf(treeEntriesCSV, "%s,%s,%s,%s\n", objectID, mode, hexOID, base64.StdEncoding.EncodeToString([]byte(path))) + } + case "tag": + content, err := ioutil.ReadAll(object) + if err != nil { + return fmt.Errorf("read all: %w", err) + } + + fmt.Fprintf(tagCSV, "%s,%s\n", objectID, 
base64.StdEncoding.EncodeToString(content)) + case "blob": + content, err := ioutil.ReadAll(object) + if err != nil { + return fmt.Errorf("read all: %w", err) + } + + fmt.Fprintf(blobCSV, "%s,%s\n", objectID, base64.StdEncoding.EncodeToString(content)) + } + } + + return nil +} + +func openCSV(name string, headers ...string) io.WriteCloser { + file, err := os.OpenFile(name, os.O_TRUNC|os.O_CREATE|os.O_WRONLY, os.ModePerm) + if err != nil { + panic(fmt.Errorf("open %q: %w", name, err)) + } + + if _, err := fmt.Fprintf(file, "%s\n", strings.Join(headers, ",")); err != nil { + panic(fmt.Errorf("write headers: %w", err)) + } + + return file +} |