Converted fast-import to a text based protocol.

Frontend clients can now send a text stream to fast-import rather than a binary stream. This should facilitate developing frontend software as the data stream is easier to view, manipulate and debug my hand and Mark-I eyeball. Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
author: Shawn O. Pearce <spearce@spearce.org> 2006-08-15 04:16:28 +0400
committer: Shawn O. Pearce <spearce@spearce.org> 2007-01-14 10:15:04 +0300
commit: c44cdc7eef212ec09901eb2e0996476e0468ed88 (patch)
tree: ab64a49e8719583156fb7a57281313db61cc60e6 /fast-import.c
parent: 7111feede9c5905199ba48645fadc369faca5711 (diff)
1 files changed, 326 insertions, 160 deletions
diff --git a/fast-import.c b/fast-import.c
index 95b84f57e5..2953e80cde 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -4,57 +4,93 @@ Format of STDIN stream:
   stream ::= cmd*;
 
   cmd ::= new_blob
-        | new_commit
         | new_branch
+        | new_commit
         | new_tag
         ;
 
-  new_blob ::= 'blob' blob_data;
+  new_blob ::= 'blob' lf
+	mark?
+    file_content;
+  file_content ::= data;
 
-  new_commit ::= 'comt' ref_name author_committer_msg
-    file_change*
-    '0';
+  new_branch ::= 'branch' sp ref_str lf
+    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
+    lf;
 
-  new_branch ::= 'brch' dst_ref_name src_ref_name;
-  dst_ref_name ::= ref_name;
-  src_ref_name ::= ref_name | sha1_exp;
-
-  new_tag ::= 'tagg' ref_name tag_name tagger_msg;
+  new_commit ::= 'commit' sp ref_str lf
+	mark?
+	('author' sp name '<' email '>' ts tz lf)?
+	'committer' sp name '<' email '>' ts tz lf
+	commit_msg
+    file_change*
+    lf;
+  commit_msg ::= data;
 
-  file_change ::= 'M' path_name hexsha1
-                | 'D' path_name
+  file_change ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf
+                | 'D' sp path_str lf
                 ;
-
-  author_committer_msg ::= len32
-    'author' sp name '<' email '>' ts tz lf
-    'committer' sp name '<' email '>' ts tz lf
-    lf
-    binary_data;
-
-  tagger_msg ::= len32
-    'tagger' sp name '<' email '>' ts tz lf
-    lf
-    binary_data;
-
-  blob_data ::= len32 binary_data; # max len is 2^32-1
-  path_name ::= len32 path;        # max len is PATH_MAX-1
-  ref_name  ::= len32 ref;         # max len is PATH_MAX-1
-  tag_name  ::= len32 tag;         # max len is PATH_MAX-1
-  sha1_exp  ::= len32 sha1exp;     # max len is PATH_MAX-1
-
-  len32 ::= # unsigned 32 bit value, native format;
+  mode ::= '644' | '755';
+
+  new_tag ::= 'tag' sp tag_str lf
+    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
+	'tagger' sp name '<' email '>' ts tz lf
+    tag_msg;
+  tag_msg ::= data;
+
+     # note: the first idnum in a stream should be 1 and subsequent
+     # idnums should not have gaps between values as this will cause
+     # the stream parser to reserve space for the gapped values.  An
+	 # idnum can be updated in the future to a new object by issuing
+     # a new mark directive with the old idnum.
+	 #
+  mark ::= 'mark' sp idnum lf;
+
+     # note: declen indicates the length of binary_data in bytes.
+     # declen does not include the lf preceeding or trailing the
+     # binary data.
+     #
+  data ::= 'data' sp declen lf
+    binary_data
+	lf;
+
+     # note: quoted strings are C-style quoting supporting \c for
+     # common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn
+	 # is the signed byte value in octal.  Note that the only
+     # characters which must actually be escaped to protect the
+     # stream formatting is: \, " and LF.  Otherwise these values
+	 # are UTF8.
+     #
+  ref_str     ::= ref     | '"' quoted(ref)     '"' ;
+  sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ;
+  tag_str     ::= tag     | '"' quoted(tag)     '"' ;
+  path_str    ::= path    | '"' quoted(path)    '"' ;
+
+  declen ::= # unsigned 32 bit value, ascii base10 notation;
   binary_data ::= # file content, not interpreted;
+
   sp ::= # ASCII space character;
   lf ::= # ASCII newline (LF) character;
-  path ::= # GIT style file path, e.g. "a/b/c";
-  ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
-  tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
+
+     # note: a colon (':') must precede the numerical value assigned to
+	 # an idnum.  This is to distinguish it from a ref or tag name as
+     # GIT does not permit ':' in ref or tag strings.
+	 #
+  idnum   ::= ':' declen;
+  path    ::= # GIT style file path, e.g. "a/b/c";
+  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
+  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
   sha1exp ::= # Any valid GIT SHA1 expression;
   hexsha1 ::= # SHA1 in hexadecimal format;
-  name ::= # valid GIT author/committer name;
+
+     # note: name and email are UTF8 strings, however name must not
+	 # contain '<' or lf and email must not contain any of the
+     # following: '<', '>', lf.
+	 #
+  name  ::= # valid GIT author/committer name;
   email ::= # valid GIT author/committer email;
-  ts ::= # time since the epoch in seconds, ascii decimal;
-  tz ::= # GIT style timezone;
+  ts    ::= # time since the epoch in seconds, ascii base10 notation;
+  tz    ::= # GIT style timezone;
 */
 
 #include "builtin.h"
@@ -66,6 +102,8 @@ Format of STDIN stream:
 #include "pack.h"
 #include "refs.h"
 #include "csum-file.h"
+#include "strbuf.h"
+#include "quote.h"
 
 struct object_entry
 {
@@ -153,7 +191,7 @@ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool);
 static size_t total_allocd;
 static struct mem_pool *mem_pool;
 
-/* atom management */
+/* Atom management */
 static unsigned int atom_table_sz = 4451;
 static unsigned int atom_cnt;
 static struct atom_str **atom_table;
@@ -184,6 +222,10 @@ static unsigned int branch_table_sz = 1039;
 static struct branch **branch_table;
 static struct branch *active_branches;
 
+/* Input stream parsing */
+static struct strbuf command_buf;
+static unsigned long command_mark;
+
 
 static void alloc_objects(int cnt)
 {
@@ -330,6 +372,8 @@ static struct branch* new_branch(const char *name)
 
 	if (b)
 		die("Invalid attempt to create duplicate branch: %s", name);
+	if (check_ref_format(name))
+		die("Branch name doesn't conform to GIT standards: %s", name);
 
 	b = pool_calloc(1, sizeof(struct branch));
 	b->name = pool_strdup(name);
@@ -433,22 +477,6 @@ static void yread(int fd, void *buffer, size_t length)
 	}
 }
 
-static int optional_read(int fd, void *buffer, size_t length)
-{
-	ssize_t ret = 0;
-	while (ret < length) {
-		ssize_t size = xread(fd, (char *) buffer + ret, length - ret);
-		if (!size && !ret)
-			return 1;
-		if (!size)
-			die("Read from descriptor %i: end of stream", fd);
-		if (size < 0)
-			die("Read from descriptor %i: %s", fd, strerror(errno));
-		ret += size;
-	}
-	return 0;
-}
-
 static void ywrite(int fd, void *buffer, size_t length)
 {
 	ssize_t ret = 0;
@@ -462,24 +490,9 @@ static void ywrite(int fd, void *buffer, size_t length)
 	}
 }
 
-static const char* read_path()
-{
-	static char sn[PATH_MAX];
-	unsigned long slen;
-
-	yread(0, &slen, 4);
-	if (!slen)
-		die("Expected string command parameter, didn't find one");
-	if (slen > (PATH_MAX - 1))
-		die("Can't handle excessive string length %lu", slen);
-	yread(0, sn, slen);
-	sn[slen] = 0;
-	return sn;
-}
-
-static unsigned long encode_header(
+static size_t encode_header(
 	enum object_type type,
-	unsigned long size,
+	size_t size,
 	unsigned char *hdr)
 {
 	int n = 1;
@@ -503,7 +516,7 @@ static unsigned long encode_header(
 static int store_object(
 	enum object_type type,
 	void *dat,
-	unsigned long datlen,
+	size_t datlen,
 	struct last_object *last,
 	unsigned char *sha1out)
 {
@@ -896,15 +909,57 @@ static void dump_branches()
 	}
 }
 
+static void read_next_command()
+{
+	read_line(&command_buf, stdin, '\n');
+}
+
+static void cmd_mark()
+{
+	if (!strncmp("mark :", command_buf.buf, 6)) {
+		command_mark = strtoul(command_buf.buf + 6, NULL, 10);
+		read_next_command();
+	}
+	else
+		command_mark = 0;
+}
+
+static void* cmd_data (size_t *size)
+{
+	size_t n = 0;
+	void *buffer;
+	size_t length;
+
+	if (strncmp("data ", command_buf.buf, 5))
+		die("Expected 'data n' command, found: %s", command_buf.buf);
+
+	length = strtoul(command_buf.buf + 5, NULL, 10);
+	buffer = xmalloc(length);
+
+	while (n < length) {
+		size_t s = fread((char*)buffer + n, 1, length - n, stdin);
+		if (!s && feof(stdin))
+			die("EOF in data (%lu bytes remaining)", length - n);
+		n += s;
+	}
+
+	if (fgetc(stdin) != '\n')
+		die("An lf did not trail the binary data as expected.");
+
+	*size = length;
+	return buffer;
+}
+
 static void cmd_new_blob()
 {
-	unsigned long datlen;
-	unsigned char sha1[20];
+	size_t datlen;
 	void *dat;
+	unsigned char sha1[20];
+
+	read_next_command();
+	cmd_mark();
+	dat = cmd_data(&datlen);
 
-	yread(0, &datlen, 4);
-	dat = xmalloc(datlen);
-	yread(0, dat, datlen);
 	if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1))
 		free(dat);
 }
@@ -949,122 +1004,231 @@ static void load_branch(struct branch *b)
 
 static void file_change_m(struct branch *b)
 {
-	const char *path = read_path();
+	const char *p = command_buf.buf + 2;
+	char *p_uq;
+	const char *endp;
 	struct object_entry *oe;
-	char hexsha1[41];
 	unsigned char sha1[20];
+	unsigned int mode;
 	char type[20];
 
-	yread(0, hexsha1, 40);
-	hexsha1[40] = 0;
+	p = get_mode(p, &mode);
+	if (!p)
+		die("Corrupt mode: %s", command_buf.buf);
+	switch (mode) {
+	case S_IFREG | 0644:
+	case S_IFREG | 0755:
+	case 0644:
+	case 0755:
+		/* ok */
+		break;
+	default:
+		die("Corrupt mode: %s", command_buf.buf);
+	}
+
+	if (get_sha1_hex(p, sha1))
+		die("Invalid SHA1: %s", command_buf.buf);
+	p += 40;
+	if (*p++ != ' ')
+		die("Missing space after SHA1: %s", command_buf.buf);
+
+	p_uq = unquote_c_style(p, &endp);
+	if (p_uq) {
+		if (*endp)
+			die("Garbage after path in: %s", command_buf.buf);
+		p = p_uq;
+	}
 
-	if (get_sha1_hex(hexsha1, sha1))
-		die("Invalid sha1 %s for %s", hexsha1, path);
 	oe = find_object(sha1);
 	if (oe) {
 		if (oe->type != OBJ_BLOB)
-			die("%s is a %s not a blob (for %s)", hexsha1, type_names[oe->type], path);
+			die("Not a blob (actually a %s): %s",
+				command_buf.buf, type_names[oe->type]);
 	} else {
 		if (sha1_object_info(sha1, type, NULL))
-			die("No blob %s for %s", hexsha1, path);
+			die("Blob not found: %s", command_buf.buf);
 		if (strcmp(blob_type, type))
-			die("%s is a %s not a blob (for %s)", hexsha1, type, path);
+			die("Not a blob (actually a %s): %s",
+				command_buf.buf, type);
 	}
 
-	tree_content_set(&b->branch_tree, path, sha1, S_IFREG | 0644);
+	tree_content_set(&b->branch_tree, p, sha1, S_IFREG | mode);
+
+	if (p_uq)
+		free(p_uq);
 }
 
 static void file_change_d(struct branch *b)
 {
-	tree_content_remove(&b->branch_tree, read_path());
+	const char *p = command_buf.buf + 2;
+	char *p_uq;
+	const char *endp;
+
+	p_uq = unquote_c_style(p, &endp);
+	if (p_uq) {
+		if (*endp)
+			die("Garbage after path in: %s", command_buf.buf);
+		p = p_uq;
+	}
+	tree_content_remove(&b->branch_tree, p);
+	if (p_uq)
+		free(p_uq);
 }
 
 static void cmd_new_commit()
 {
-	static const unsigned int max_hdr_len = 94;
-	const char *name = read_path();
-	struct branch *b = lookup_branch(name);
-	unsigned int acmsglen;
-	char *body, *c;
-
+	struct branch *b;
+	void *msg;
+	size_t msglen;
+	char *str_uq;
+	const char *endp;
+	char *sp;
+	char *author = NULL;
+	char *committer = NULL;
+	char *body;
+
+	/* Obtain the branch name from the rest of our command */
+	sp = strchr(command_buf.buf, ' ') + 1;
+	str_uq = unquote_c_style(sp, &endp);
+	if (str_uq) {
+		if (*endp)
+			die("Garbage after ref in: %s", command_buf.buf);
+		sp = str_uq;
+	}
+	b = lookup_branch(sp);
 	if (!b)
-		die("Branch not declared: %s", name);
+		die("Branch not declared: %s", sp);
+	if (str_uq)
+		free(str_uq);
+
+	read_next_command();
+	cmd_mark();
+	if (!strncmp("author ", command_buf.buf, 7)) {
+		author = strdup(command_buf.buf);
+		read_next_command();
+	}
+	if (!strncmp("committer ", command_buf.buf, 10)) {
+		committer = strdup(command_buf.buf);
+		read_next_command();
+	}
+	if (!committer)
+		die("Expected committer but didn't get one");
+	msg = cmd_data(&msglen);
+
+	/* ensure the branch is active/loaded */
 	if (!b->branch_tree.tree) {
 		unload_one_branch();
 		load_branch(b);
 	}
 
-	/* author_committer_msg */
-	yread(0, &acmsglen, 4);
-	body = xmalloc(acmsglen + max_hdr_len);
-	c = body + max_hdr_len;
-	yread(0, c, acmsglen);
-
-	/* oddly enough this is all that fsck-objects cares about */
-	if (memcmp(c, "author ", 7))
-		die("Invalid commit format on branch %s", name);
-
 	/* file_change* */
 	for (;;) {
-		unsigned char cmd;
-		yread(0, &cmd, 1);
-		if (cmd == '0')
+		read_next_command();
+		if (1 == command_buf.len)
 			break;
-		else if (cmd == 'M')
+		else if (!strncmp("M ", command_buf.buf, 2))
 			file_change_m(b);
-		else if (cmd == 'D')
+		else if (!strncmp("D ", command_buf.buf, 2))
 			file_change_d(b);
 		else
-			die("Unsupported file_change: %c", cmd);
+			die("Unsupported file_change: %s", command_buf.buf);
 	}
 
-	if (memcmp(b->sha1, null_sha1, 20)) {
-		sprintf(c - 48, "parent %s", sha1_to_hex(b->sha1));
-		*(c - 1) = '\n';
-		c -= 48;
-	}
+	/* build the tree and the commit */
 	store_tree(&b->branch_tree);
-	sprintf(c - 46, "tree %s", sha1_to_hex(b->branch_tree.sha1));
-	*(c - 1) = '\n';
-	c -= 46;
-
-	store_object(OBJ_COMMIT,
-		c, (body + max_hdr_len + acmsglen) - c,
-		NULL, b->sha1);
+	body = xmalloc(97 + msglen
+		+ (author
+			? strlen(author) + strlen(committer)
+			: 2 * strlen(committer)));
+	sp = body;
+	sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1));
+	if (memcmp(b->sha1, null_sha1, 20))
+		sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1));
+	if (author)
+		sp += sprintf(sp, "%s\n", author);
+	else
+		sp += sprintf(sp, "author %s\n", committer + 10);
+	sp += sprintf(sp, "%s\n\n", committer);
+	memcpy(sp, msg, msglen);
+	sp += msglen;
+	if (author)
+		free(author);
+	free(committer);
+	free(msg);
+
+	store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1);
 	free(body);
 	b->last_commit = object_count_by_type[OBJ_COMMIT];
 }
 
 static void cmd_new_branch()
 {
-	struct branch *b = new_branch(read_path());
-	const char *base = read_path();
-	struct branch *s = lookup_branch(base);
-
-	if (!strcmp(b->name, base))
-		die("Can't create a branch from itself: %s", base);
-	else if (s) {
-		memcpy(b->sha1, s->sha1, 20);
-		memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20);
+	struct branch *b;
+	char *str_uq;
+	const char *endp;
+	char *sp;
+
+	/* Obtain the new branch name from the rest of our command */
+	sp = strchr(command_buf.buf, ' ') + 1;
+	str_uq = unquote_c_style(sp, &endp);
+	if (str_uq) {
+		if (*endp)
+			die("Garbage after ref in: %s", command_buf.buf);
+		sp = str_uq;
 	}
-	else if (!get_sha1(base, b->sha1)) {
-		if (!memcmp(b->sha1, null_sha1, 20))
-			memcpy(b->branch_tree.sha1, null_sha1, 20);
-		else {
-			unsigned long size;
-			char *buf;
-
-			buf = read_object_with_reference(b->sha1,
-				type_names[OBJ_COMMIT], &size, b->sha1);
-			if (!buf || size < 46)
-				die("Not a valid commit: %s", base);
-			if (memcmp("tree ", buf, 5)
-				|| get_sha1_hex(buf + 5, b->branch_tree.sha1))
-				die("The commit %s is corrupt", sha1_to_hex(b->sha1));
-			free(buf);
+	b = new_branch(sp);
+	if (str_uq)
+		free(str_uq);
+	read_next_command();
+
+	/* from ... */
+	if (!strncmp("from ", command_buf.buf, 5)) {
+		const char *from;
+		struct branch *s;
+
+		from = strchr(command_buf.buf, ' ') + 1;
+		str_uq = unquote_c_style(from, &endp);
+		if (str_uq) {
+			if (*endp)
+				die("Garbage after string in: %s", command_buf.buf);
+			from = str_uq;
 		}
-	} else
-		die("Not a SHA1 or branch: %s", base);
+
+		s = lookup_branch(from);
+		if (b == s)
+			die("Can't create a branch from itself: %s", b->name);
+		else if (s) {
+			memcpy(b->sha1, s->sha1, 20);
+			memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20);
+		} else if (!get_sha1(from, b->sha1)) {
+			if (!memcmp(b->sha1, null_sha1, 20))
+				memcpy(b->branch_tree.sha1, null_sha1, 20);
+			else {
+				unsigned long size;
+				char *buf;
+
+				buf = read_object_with_reference(b->sha1,
+					type_names[OBJ_COMMIT], &size, b->sha1);
+				if (!buf || size < 46)
+					die("Not a valid commit: %s", from);
+				if (memcmp("tree ", buf, 5)
+					|| get_sha1_hex(buf + 5, b->branch_tree.sha1))
+					die("The commit %s is corrupt", sha1_to_hex(b->sha1));
+				free(buf);
+			}
+		} else
+			die("Invalid ref name or SHA1 expression: %s", from);
+
+		if (str_uq)
+			free(str_uq);
+		read_next_command();
+	} else {
+		memcpy(b->sha1, null_sha1, 20);
+		memcpy(b->branch_tree.sha1, null_sha1, 20);
+	}
+
+	if (command_buf.eof || command_buf.len > 1)
+		die("An lf did not terminate the branch command as expected.");
 }
 
 int main(int argc, const char **argv)
@@ -1087,26 +1251,28 @@ int main(int argc, const char **argv)
 	if (pack_fd < 0)
 		die("Can't create %s: %s", pack_name, strerror(errno));
 
+	init_pack_header();
 	alloc_objects(est_obj_cnt);
+	strbuf_init(&command_buf);
 
 	atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*));
 	branch_table = xcalloc(branch_table_sz, sizeof(struct branch*));
 	avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*));
 
-	init_pack_header();
 	for (;;) {
-		unsigned long cmd;
-		if (optional_read(0, &cmd, 4))
+		read_next_command();
+		if (command_buf.eof)
 			break;
-
-		switch (ntohl(cmd)) {
-		case 'blob': cmd_new_blob();   break;
-		case 'comt': cmd_new_commit(); break;
-		case 'brch': cmd_new_branch(); break;
-		default:
-			die("Invalid command %lu", cmd);
-		}
+		else if (!strcmp("blob", command_buf.buf))
+			cmd_new_blob();
+		else if (!strncmp("branch ", command_buf.buf, 7))
+			cmd_new_branch();
+		else if (!strncmp("commit ", command_buf.buf, 7))
+			cmd_new_commit();
+		else
+			die("Unsupported command: %s", command_buf.buf);
 	}
+
 	fixup_header_footer();
 	close(pack_fd);
 	write_index(idx_name);
author	Shawn O. Pearce <spearce@spearce.org>	2006-08-15 04:16:28 +0400
committer	Shawn O. Pearce <spearce@spearce.org>	2007-01-14 10:15:04 +0300
commit	c44cdc7eef212ec09901eb2e0996476e0468ed88 (patch)
tree	ab64a49e8719583156fb7a57281313db61cc60e6 /fast-import.c
parent	7111feede9c5905199ba48645fadc369faca5711 (diff)