From c0ad465725302b9411e53d248871c36880b6f8fd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 28 Oct 2011 11:40:48 -0700 Subject: write_pack_header(): a helper function Factor out a small logic out of the private write_pack_file() function in builtin/pack-objects.c Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 9 +++------ pack-write.c | 12 ++++++++++++ pack.h | 2 ++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index ba3705d1de..6643c1657a 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -571,7 +571,6 @@ static void write_pack_file(void) uint32_t i = 0, j; struct sha1file *f; off_t offset; - struct pack_header hdr; uint32_t nr_remaining = nr_result; time_t last_mtime = 0; struct object_entry **write_order; @@ -596,11 +595,9 @@ static void write_pack_file(void) f = sha1fd(fd, pack_tmp_name); } - hdr.hdr_signature = htonl(PACK_SIGNATURE); - hdr.hdr_version = htonl(PACK_VERSION); - hdr.hdr_entries = htonl(nr_remaining); - sha1write(f, &hdr, sizeof(hdr)); - offset = sizeof(hdr); + offset = write_pack_header(f, nr_remaining); + if (!offset) + die_errno("unable to write pack header"); nr_written = 0; for (; i < nr_objects; i++) { struct object_entry *e = write_order[i]; diff --git a/pack-write.c b/pack-write.c index 9cd3bfbb4b..46f3f846be 100644 --- a/pack-write.c +++ b/pack-write.c @@ -178,6 +178,18 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec return index_name; } +off_t write_pack_header(struct sha1file *f, uint32_t nr_entries) +{ + struct pack_header hdr; + + hdr.hdr_signature = htonl(PACK_SIGNATURE); + hdr.hdr_version = htonl(PACK_VERSION); + hdr.hdr_entries = htonl(nr_entries); + if (sha1write(f, &hdr, sizeof(hdr))) + return 0; + return sizeof(hdr); +} + /* * Update pack header with object_count and compute new SHA1 for pack data * associated to pack_fd, and write that SHA1 at the end. 
That new SHA1 diff --git a/pack.h b/pack.h index 722a54e00a..d429d8a668 100644 --- a/pack.h +++ b/pack.h @@ -2,6 +2,7 @@ #define PACK_H #include "object.h" +#include "csum-file.h" /* * Packed object header @@ -74,6 +75,7 @@ extern const char *write_idx_file(const char *index_name, struct pack_idx_entry extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, off_t offset, off_t len, unsigned int nr); extern int verify_pack_index(struct packed_git *); extern int verify_pack(struct packed_git *); +extern off_t write_pack_header(struct sha1file *f, uint32_t); extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t); extern char *index_pack_lockfile(int fd); extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *); -- cgit v1.2.3 From cdf9db3c83618453809d6a584d1a0db19b3e189f Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 28 Oct 2011 11:52:14 -0700 Subject: create_tmp_packfile(): a helper function Factor out a small logic out of the private write_pack_file() function in builtin/pack-objects.c Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 12 +++--------- pack-write.c | 10 ++++++++++ pack.h | 3 +++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 6643c1657a..3258fa96e8 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -584,16 +584,10 @@ static void write_pack_file(void) unsigned char sha1[20]; char *pack_tmp_name = NULL; - if (pack_to_stdout) { + if (pack_to_stdout) f = sha1fd_throughput(1, "", progress_state); - } else { - char tmpname[PATH_MAX]; - int fd; - fd = odb_mkstemp(tmpname, sizeof(tmpname), - "pack/tmp_pack_XXXXXX"); - pack_tmp_name = xstrdup(tmpname); - f = sha1fd(fd, pack_tmp_name); - } + else + f = create_tmp_packfile(&pack_tmp_name); offset = write_pack_header(f, nr_remaining); if (!offset) diff --git a/pack-write.c b/pack-write.c index 
46f3f846be..863cce8a09 100644 --- a/pack-write.c +++ b/pack-write.c @@ -328,3 +328,13 @@ int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned *hdr = c; return n; } + +struct sha1file *create_tmp_packfile(char **pack_tmp_name) +{ + char tmpname[PATH_MAX]; + int fd; + + fd = odb_mkstemp(tmpname, sizeof(tmpname), "pack/tmp_pack_XXXXXX"); + *pack_tmp_name = xstrdup(tmpname); + return sha1fd(fd, *pack_tmp_name); +} diff --git a/pack.h b/pack.h index d429d8a668..0027ac6149 100644 --- a/pack.h +++ b/pack.h @@ -84,4 +84,7 @@ extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned ch #define PH_ERROR_PACK_SIGNATURE (-2) #define PH_ERROR_PROTOCOL (-3) extern int read_pack_header(int fd, struct pack_header *); + +extern struct sha1file *create_tmp_packfile(char **pack_tmp_name); + #endif -- cgit v1.2.3 From 0e990530ae6d6c6805d31c666953541f762dd402 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 28 Oct 2011 12:34:09 -0700 Subject: finish_tmp_packfile(): a helper function Factor out a small logic out of the private write_pack_file() function in builtin/pack-objects.c. This changes the order of finishing multi-pack generation slightly. The code used to - adjust shared perm of temporary packfile - rename temporary packfile to the final name - update mtime of the packfile under the final name - adjust shared perm of temporary idxfile - rename temporary idxfile to the final name but because the helper does not want to do the mtime thing, the updated code does that step first and then all the rest. 
Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 33 ++++++++++----------------------- pack-write.c | 31 +++++++++++++++++++++++++++++++ pack.h | 1 + 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 3258fa96e8..b458b6d3f4 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -617,20 +617,8 @@ static void write_pack_file(void) if (!pack_to_stdout) { struct stat st; - const char *idx_tmp_name; char tmpname[PATH_MAX]; - idx_tmp_name = write_idx_file(NULL, written_list, nr_written, - &pack_idx_opts, sha1); - - snprintf(tmpname, sizeof(tmpname), "%s-%s.pack", - base_name, sha1_to_hex(sha1)); - free_pack_by_name(tmpname); - if (adjust_shared_perm(pack_tmp_name)) - die_errno("unable to make temporary pack file readable"); - if (rename(pack_tmp_name, tmpname)) - die_errno("unable to rename temporary pack file"); - /* * Packs are runtime accessed in their mtime * order since newer packs are more likely to contain @@ -638,28 +626,27 @@ static void write_pack_file(void) * packs then we should modify the mtime of later ones * to preserve this property. */ - if (stat(tmpname, &st) < 0) { + if (stat(pack_tmp_name, &st) < 0) { warning("failed to stat %s: %s", - tmpname, strerror(errno)); + pack_tmp_name, strerror(errno)); } else if (!last_mtime) { last_mtime = st.st_mtime; } else { struct utimbuf utb; utb.actime = st.st_atime; utb.modtime = --last_mtime; - if (utime(tmpname, &utb) < 0) + if (utime(pack_tmp_name, &utb) < 0) warning("failed utime() on %s: %s", tmpname, strerror(errno)); } - snprintf(tmpname, sizeof(tmpname), "%s-%s.idx", - base_name, sha1_to_hex(sha1)); - if (adjust_shared_perm(idx_tmp_name)) - die_errno("unable to make temporary index file readable"); - if (rename(idx_tmp_name, tmpname)) - die_errno("unable to rename temporary index file"); - - free((void *) idx_tmp_name); + /* Enough space for "-.pack"? 
*/ + if (sizeof(tmpname) <= strlen(base_name) + 50) + die("pack base name '%s' too long", base_name); + snprintf(tmpname, sizeof(tmpname), "%s-", base_name); + finish_tmp_packfile(tmpname, pack_tmp_name, + written_list, nr_written, + &pack_idx_opts, sha1); free(pack_tmp_name); puts(sha1_to_hex(sha1)); } diff --git a/pack-write.c b/pack-write.c index 863cce8a09..cadc3e1adf 100644 --- a/pack-write.c +++ b/pack-write.c @@ -338,3 +338,34 @@ struct sha1file *create_tmp_packfile(char **pack_tmp_name) *pack_tmp_name = xstrdup(tmpname); return sha1fd(fd, *pack_tmp_name); } + +void finish_tmp_packfile(char *name_buffer, + const char *pack_tmp_name, + struct pack_idx_entry **written_list, + uint32_t nr_written, + struct pack_idx_option *pack_idx_opts, + unsigned char sha1[]) +{ + const char *idx_tmp_name; + char *end_of_name_prefix = strrchr(name_buffer, 0); + + if (adjust_shared_perm(pack_tmp_name)) + die_errno("unable to make temporary pack file readable"); + + idx_tmp_name = write_idx_file(NULL, written_list, nr_written, + pack_idx_opts, sha1); + if (adjust_shared_perm(idx_tmp_name)) + die_errno("unable to make temporary index file readable"); + + sprintf(end_of_name_prefix, "%s.pack", sha1_to_hex(sha1)); + free_pack_by_name(name_buffer); + + if (rename(pack_tmp_name, name_buffer)) + die_errno("unable to rename temporary pack file"); + + sprintf(end_of_name_prefix, "%s.idx", sha1_to_hex(sha1)); + if (rename(idx_tmp_name, name_buffer)) + die_errno("unable to rename temporary index file"); + + free((void *)idx_tmp_name); +} diff --git a/pack.h b/pack.h index 0027ac6149..cfb0f6976d 100644 --- a/pack.h +++ b/pack.h @@ -86,5 +86,6 @@ extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned ch extern int read_pack_header(int fd, struct pack_header *); extern struct sha1file *create_tmp_packfile(char **pack_tmp_name); +extern void finish_tmp_packfile(char *name_buffer, const char *pack_tmp_name, struct pack_idx_entry **written_list, uint32_t nr_written, 
struct pack_idx_option *pack_idx_opts, unsigned char sha1[]); #endif -- cgit v1.2.3 From 6c526148644475acbc047faaeb57d42fde1f667d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 17 Nov 2011 16:26:54 -0800 Subject: csum-file: introduce sha1file_checkpoint It is useful to be able to rewind a check-summed file to a certain previous state after writing data into it using sha1write() API. The fast-import command does this after streaming a blob data to the packfile being generated and then noticing that the same blob has already been written, and it does this with a private code truncate_pack() that is commented as "Yes, this is a layering violation". Introduce two API functions: sha1file_checkpoint(), which allows the caller to save the state of a sha1file, and sha1file_truncate(), which later reverts the file to that saved state. Use them to reimplement truncate_pack(). Signed-off-by: Junio C Hamano --- csum-file.c | 20 ++++++++++++++++++++ csum-file.h | 9 +++++++++ fast-import.c | 25 ++++++++----------------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/csum-file.c b/csum-file.c index fc97d6e045..53f5375b6c 100644 --- a/csum-file.c +++ b/csum-file.c @@ -158,6 +158,26 @@ struct sha1file *sha1fd_throughput(int fd, const char *name, struct progress *tp return f; } +void sha1file_checkpoint(struct sha1file *f, struct sha1file_checkpoint *checkpoint) +{ + sha1flush(f); + checkpoint->offset = f->total; + checkpoint->ctx = f->ctx; +} + +int sha1file_truncate(struct sha1file *f, struct sha1file_checkpoint *checkpoint) +{ + off_t offset = checkpoint->offset; + + if (ftruncate(f->fd, offset) || + lseek(f->fd, offset, SEEK_SET) != offset) + return -1; + f->total = offset; + f->ctx = checkpoint->ctx; + f->offset = 0; /* sha1flush() was called in checkpoint */ + return 0; +} + void crc32_begin(struct sha1file *f) { f->crc32 = crc32(0, NULL, 0); diff --git a/csum-file.h b/csum-file.h index 6a7967c6bf..3b540bdc21 100644 --- a/csum-file.h +++ b/csum-file.h @@ -17,6 +17,15 @@ struct 
sha1file { unsigned char buffer[8192]; }; +/* Checkpoint */ +struct sha1file_checkpoint { + off_t offset; + git_SHA_CTX ctx; +}; + +extern void sha1file_checkpoint(struct sha1file *, struct sha1file_checkpoint *); +extern int sha1file_truncate(struct sha1file *, struct sha1file_checkpoint *); + /* sha1close flags */ #define CSUM_CLOSE 1 #define CSUM_FSYNC 2 diff --git a/fast-import.c b/fast-import.c index 8d8ea3c45c..a8db41b1df 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1143,17 +1143,11 @@ static int store_object( return 0; } -static void truncate_pack(off_t to, git_SHA_CTX *ctx) +static void truncate_pack(struct sha1file_checkpoint *checkpoint) { - if (ftruncate(pack_data->pack_fd, to) - || lseek(pack_data->pack_fd, to, SEEK_SET) != to) + if (sha1file_truncate(pack_file, checkpoint)) die_errno("cannot truncate pack to skip duplicate"); - pack_size = to; - - /* yes this is a layering violation */ - pack_file->total = to; - pack_file->offset = 0; - pack_file->ctx = *ctx; + pack_size = checkpoint->offset; } static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark) @@ -1166,8 +1160,8 @@ static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark) unsigned long hdrlen; off_t offset; git_SHA_CTX c; - git_SHA_CTX pack_file_ctx; git_zstream s; + struct sha1file_checkpoint checkpoint; int status = Z_OK; /* Determine if we should auto-checkpoint. 
*/ @@ -1175,11 +1169,8 @@ static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark) || (pack_size + 60 + len) < pack_size) cycle_packfile(); - offset = pack_size; - - /* preserve the pack_file SHA1 ctx in case we have to truncate later */ - sha1flush(pack_file); - pack_file_ctx = pack_file->ctx; + sha1file_checkpoint(pack_file, &checkpoint); + offset = checkpoint.offset; hdrlen = snprintf((char *)out_buf, out_sz, "blob %" PRIuMAX, len) + 1; if (out_sz <= hdrlen) @@ -1245,14 +1236,14 @@ static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark) if (e->idx.offset) { duplicate_count_by_type[OBJ_BLOB]++; - truncate_pack(offset, &pack_file_ctx); + truncate_pack(&checkpoint); } else if (find_sha1_pack(sha1, packed_git)) { e->type = OBJ_BLOB; e->pack_id = MAX_PACK_ID; e->idx.offset = 1; /* just not zero! */ duplicate_count_by_type[OBJ_BLOB]++; - truncate_pack(offset, &pack_file_ctx); + truncate_pack(&checkpoint); } else { e->depth = 0; -- cgit v1.2.3 From 568508e76570e9ea36aad6446959424cebcf0535 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 28 Oct 2011 14:48:40 -0700 Subject: bulk-checkin: replace fast-import based implementation This extends the earlier approach to stream a large file directly from the filesystem to its own packfile, and allows "git add" to send large files directly into a single pack. Older code used to spawn fast-import, but the new bulk-checkin API replaces it. 
Signed-off-by: Junio C Hamano --- Makefile | 2 + builtin/add.c | 5 + builtin/pack-objects.c | 6 +- bulk-checkin.c | 275 +++++++++++++++++++++++++++++++++++++++++++++++++ bulk-checkin.h | 16 +++ cache.h | 2 + config.c | 4 + environment.c | 1 + sha1_file.c | 67 +----------- t/t1050-large.sh | 94 +++++++++++++++-- zlib.c | 9 +- 11 files changed, 403 insertions(+), 78 deletions(-) create mode 100644 bulk-checkin.c create mode 100644 bulk-checkin.h diff --git a/Makefile b/Makefile index 3139c19b10..418dd2e3e6 100644 --- a/Makefile +++ b/Makefile @@ -505,6 +505,7 @@ LIB_H += argv-array.h LIB_H += attr.h LIB_H += blob.h LIB_H += builtin.h +LIB_H += bulk-checkin.h LIB_H += cache.h LIB_H += cache-tree.h LIB_H += color.h @@ -591,6 +592,7 @@ LIB_OBJS += base85.o LIB_OBJS += bisect.o LIB_OBJS += blob.o LIB_OBJS += branch.o +LIB_OBJS += bulk-checkin.o LIB_OBJS += bundle.o LIB_OBJS += cache-tree.o LIB_OBJS += color.o diff --git a/builtin/add.c b/builtin/add.c index c59b0c98fe..1c42900ff8 100644 --- a/builtin/add.c +++ b/builtin/add.c @@ -13,6 +13,7 @@ #include "diff.h" #include "diffcore.h" #include "revision.h" +#include "bulk-checkin.h" static const char * const builtin_add_usage[] = { "git add [options] [--] ...", @@ -458,11 +459,15 @@ int cmd_add(int argc, const char **argv, const char *prefix) free(seen); } + plug_bulk_checkin(); + exit_status |= add_files_to_cache(prefix, pathspec, flags); if (add_new_files) exit_status |= add_files(&dir, flags); + unplug_bulk_checkin(); + finish: if (active_cache_changed) { if (write_cache(newfd, active_cache, active_nr) || diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b458b6d3f4..dde913ebb4 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -76,7 +76,7 @@ static struct pack_idx_option pack_idx_opts; static const char *base_name; static int progress = 1; static int window = 10; -static unsigned long pack_size_limit, pack_size_limit_cfg; +static unsigned long pack_size_limit; static int depth = 50; 
static int delta_search_threads; static int pack_to_stdout; @@ -2009,10 +2009,6 @@ static int git_pack_config(const char *k, const char *v, void *cb) pack_idx_opts.version); return 0; } - if (!strcmp(k, "pack.packsizelimit")) { - pack_size_limit_cfg = git_config_ulong(k, v); - return 0; - } return git_default_config(k, v, cb); } diff --git a/bulk-checkin.c b/bulk-checkin.c new file mode 100644 index 0000000000..6b0b6d4904 --- /dev/null +++ b/bulk-checkin.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2011, Google Inc. + */ +#include "bulk-checkin.h" +#include "csum-file.h" +#include "pack.h" + +static int pack_compression_level = Z_DEFAULT_COMPRESSION; + +static struct bulk_checkin_state { + unsigned plugged:1; + + char *pack_tmp_name; + struct sha1file *f; + off_t offset; + struct pack_idx_option pack_idx_opts; + + struct pack_idx_entry **written; + uint32_t alloc_written; + uint32_t nr_written; +} state; + +static void finish_bulk_checkin(struct bulk_checkin_state *state) +{ + unsigned char sha1[20]; + char packname[PATH_MAX]; + int i; + + if (!state->f) + return; + + if (state->nr_written == 0) { + close(state->f->fd); + unlink(state->pack_tmp_name); + goto clear_exit; + } else if (state->nr_written == 1) { + sha1close(state->f, sha1, CSUM_FSYNC); + } else { + int fd = sha1close(state->f, sha1, 0); + fixup_pack_header_footer(fd, sha1, state->pack_tmp_name, + state->nr_written, sha1, + state->offset); + close(fd); + } + + sprintf(packname, "%s/pack/pack-", get_object_directory()); + finish_tmp_packfile(packname, state->pack_tmp_name, + state->written, state->nr_written, + &state->pack_idx_opts, sha1); + for (i = 0; i < state->nr_written; i++) + free(state->written[i]); + +clear_exit: + free(state->written); + memset(state, 0, sizeof(*state)); + + /* Make objects we just wrote available to ourselves */ + reprepare_packed_git(); +} + +static int already_written(struct bulk_checkin_state *state, unsigned char sha1[]) +{ + int i; + + /* The object may already exist in the 
repository */ + if (has_sha1_file(sha1)) + return 1; + + /* Might want to keep the list sorted */ + for (i = 0; i < state->nr_written; i++) + if (!hashcmp(state->written[i]->sha1, sha1)) + return 1; + + /* This is a new object we need to keep */ + return 0; +} + +/* + * Read the contents from fd for size bytes, streaming it to the + * packfile in state while updating the hash in ctx. Signal a failure + * by returning a negative value when the resulting pack would exceed + * the pack size limit and this is not the first object in the pack, + * so that the caller can discard what we wrote from the current pack + * by truncating it and opening a new one. The caller will then call + * us again after rewinding the input fd. + * + * The already_hashed_to pointer is kept untouched by the caller to + * make sure we do not hash the same byte when we are called + * again. This way, the caller does not have to checkpoint its hash + * status before calling us just in case we ask it to call us again + * with a new pack. + */ +static int stream_to_pack(struct bulk_checkin_state *state, + git_SHA_CTX *ctx, off_t *already_hashed_to, + int fd, size_t size, enum object_type type, + const char *path, unsigned flags) +{ + git_zstream s; + unsigned char obuf[16384]; + unsigned hdrlen; + int status = Z_OK; + int write_object = (flags & HASH_WRITE_OBJECT); + off_t offset = 0; + + memset(&s, 0, sizeof(s)); + git_deflate_init(&s, pack_compression_level); + + hdrlen = encode_in_pack_object_header(type, size, obuf); + s.next_out = obuf + hdrlen; + s.avail_out = sizeof(obuf) - hdrlen; + + while (status != Z_STREAM_END) { + unsigned char ibuf[16384]; + + if (size && !s.avail_in) { + ssize_t rsize = size < sizeof(ibuf) ? 
size : sizeof(ibuf); + if (xread(fd, ibuf, rsize) != rsize) + die("failed to read %d bytes from '%s'", + (int)rsize, path); + offset += rsize; + if (*already_hashed_to < offset) { + size_t hsize = offset - *already_hashed_to; + if (rsize < hsize) + hsize = rsize; + if (hsize) + git_SHA1_Update(ctx, ibuf, hsize); + *already_hashed_to = offset; + } + s.next_in = ibuf; + s.avail_in = rsize; + size -= rsize; + } + + status = git_deflate(&s, size ? 0 : Z_FINISH); + + if (!s.avail_out || status == Z_STREAM_END) { + if (write_object) { + size_t written = s.next_out - obuf; + + /* would we bust the size limit? */ + if (state->nr_written && + pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + written) { + git_deflate_abort(&s); + return -1; + } + + sha1write(state->f, obuf, written); + state->offset += written; + } + s.next_out = obuf; + s.avail_out = sizeof(obuf); + } + + switch (status) { + case Z_OK: + case Z_BUF_ERROR: + case Z_STREAM_END: + continue; + default: + die("unexpected deflate failure: %d", status); + } + } + git_deflate_end(&s); + return 0; +} + +/* Lazily create backing packfile for the state */ +static void prepare_to_stream(struct bulk_checkin_state *state, + unsigned flags) +{ + if (!(flags & HASH_WRITE_OBJECT) || state->f) + return; + + state->f = create_tmp_packfile(&state->pack_tmp_name); + reset_pack_idx_option(&state->pack_idx_opts); + + /* Pretend we are going to write only one object */ + state->offset = write_pack_header(state->f, 1); + if (!state->offset) + die_errno("unable to write pack header"); +} + +static int deflate_to_pack(struct bulk_checkin_state *state, + unsigned char result_sha1[], + int fd, size_t size, + enum object_type type, const char *path, + unsigned flags) +{ + off_t seekback, already_hashed_to; + git_SHA_CTX ctx; + unsigned char obuf[16384]; + unsigned header_len; + struct sha1file_checkpoint checkpoint; + struct pack_idx_entry *idx = NULL; + + seekback = lseek(fd, 0, SEEK_CUR); + if (seekback == (off_t) -1) + 
return error("cannot find the current offset"); + + header_len = sprintf((char *)obuf, "%s %" PRIuMAX, + typename(type), (uintmax_t)size) + 1; + git_SHA1_Init(&ctx); + git_SHA1_Update(&ctx, obuf, header_len); + + /* Note: idx is non-NULL when we are writing */ + if ((flags & HASH_WRITE_OBJECT) != 0) + idx = xcalloc(1, sizeof(*idx)); + + already_hashed_to = 0; + + while (1) { + prepare_to_stream(state, flags); + if (idx) { + sha1file_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + } + if (!stream_to_pack(state, &ctx, &already_hashed_to, + fd, size, type, path, flags)) + break; + /* + * Writing this object to the current pack will make + * it too big; we need to truncate it, start a new + * pack, and write into it. + */ + if (!idx) + die("BUG: should not happen"); + sha1file_truncate(state->f, &checkpoint); + state->offset = checkpoint.offset; + finish_bulk_checkin(state); + if (lseek(fd, seekback, SEEK_SET) == (off_t) -1) + return error("cannot seek back"); + } + git_SHA1_Final(result_sha1, &ctx); + if (!idx) + return 0; + + idx->crc32 = crc32_end(state->f); + if (already_written(state, result_sha1)) { + sha1file_truncate(state->f, &checkpoint); + state->offset = checkpoint.offset; + free(idx); + } else { + hashcpy(idx->sha1, result_sha1); + ALLOC_GROW(state->written, + state->nr_written + 1, + state->alloc_written); + state->written[state->nr_written++] = idx; + } + return 0; +} + +int index_bulk_checkin(unsigned char *sha1, + int fd, size_t size, enum object_type type, + const char *path, unsigned flags) +{ + int status = deflate_to_pack(&state, sha1, fd, size, type, + path, flags); + if (!state.plugged) + finish_bulk_checkin(&state); + return status; +} + +void plug_bulk_checkin(void) +{ + state.plugged = 1; +} + +void unplug_bulk_checkin(void) +{ + state.plugged = 0; + if (state.f) + finish_bulk_checkin(&state); +} diff --git a/bulk-checkin.h b/bulk-checkin.h new file mode 100644 index 0000000000..4f599f8841 --- 
/dev/null +++ b/bulk-checkin.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2011, Google Inc. + */ +#ifndef BULK_CHECKIN_H +#define BULK_CHECKIN_H + +#include "cache.h" + +extern int index_bulk_checkin(unsigned char sha1[], + int fd, size_t size, enum object_type type, + const char *path, unsigned flags); + +extern void plug_bulk_checkin(void); +extern void unplug_bulk_checkin(void); + +#endif diff --git a/cache.h b/cache.h index 2e6ad3604e..4f2086151c 100644 --- a/cache.h +++ b/cache.h @@ -35,6 +35,7 @@ int git_inflate(git_zstream *, int flush); void git_deflate_init(git_zstream *, int level); void git_deflate_init_gzip(git_zstream *, int level); void git_deflate_end(git_zstream *); +int git_deflate_abort(git_zstream *); int git_deflate_end_gently(git_zstream *); int git_deflate(git_zstream *, int flush); unsigned long git_deflate_bound(git_zstream *, unsigned long); @@ -598,6 +599,7 @@ extern size_t packed_git_window_size; extern size_t packed_git_limit; extern size_t delta_base_cache_limit; extern unsigned long big_file_threshold; +extern unsigned long pack_size_limit_cfg; extern int read_replace_refs; extern int fsync_object_files; extern int core_preload_index; diff --git a/config.c b/config.c index edf9914df6..c736802057 100644 --- a/config.c +++ b/config.c @@ -797,6 +797,10 @@ int git_default_config(const char *var, const char *value, void *dummy) return 0; } + if (!strcmp(var, "pack.packsizelimit")) { + pack_size_limit_cfg = git_config_ulong(var, value); + return 0; + } /* Add other config variables here and to Documentation/config.txt. */ return 0; } diff --git a/environment.c b/environment.c index 0bee6a7a88..31e42847b1 100644 --- a/environment.c +++ b/environment.c @@ -60,6 +60,7 @@ char *notes_ref_name; int grafts_replace_parents = 1; int core_apply_sparse_checkout; struct startup_info *startup_info; +unsigned long pack_size_limit_cfg; /* Parallel index stat data preload? 
*/ int core_preload_index = 0; diff --git a/sha1_file.c b/sha1_file.c index 27f3b9b278..c96e366b76 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -18,6 +18,7 @@ #include "refs.h" #include "pack-revindex.h" #include "sha1-lookup.h" +#include "bulk-checkin.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -2679,10 +2680,8 @@ static int index_core(unsigned char *sha1, int fd, size_t size, } /* - * This creates one packfile per large blob, because the caller - * immediately wants the result sha1, and fast-import can report the - * object name via marks mechanism only by closing the created - * packfile. + * This creates one packfile per large blob unless bulk-checkin + * machinery is "plugged". * * This also bypasses the usual "convert-to-git" dance, and that is on * purpose. We could write a streaming version of the converting @@ -2696,65 +2695,7 @@ static int index_stream(unsigned char *sha1, int fd, size_t size, enum object_type type, const char *path, unsigned flags) { - struct child_process fast_import; - char export_marks[512]; - const char *argv[] = { "fast-import", "--quiet", export_marks, NULL }; - char tmpfile[512]; - char fast_import_cmd[512]; - char buf[512]; - int len, tmpfd; - - strcpy(tmpfile, git_path("hashstream_XXXXXX")); - tmpfd = git_mkstemp_mode(tmpfile, 0600); - if (tmpfd < 0) - die_errno("cannot create tempfile: %s", tmpfile); - if (close(tmpfd)) - die_errno("cannot close tempfile: %s", tmpfile); - sprintf(export_marks, "--export-marks=%s", tmpfile); - - memset(&fast_import, 0, sizeof(fast_import)); - fast_import.in = -1; - fast_import.argv = argv; - fast_import.git_cmd = 1; - if (start_command(&fast_import)) - die_errno("index-stream: git fast-import failed"); - - len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n", - (unsigned long) size); - write_or_whine(fast_import.in, fast_import_cmd, len, - "index-stream: feeding fast-import"); - while (size) { - char buf[10240]; - size_t sz = size < 
sizeof(buf) ? size : sizeof(buf); - ssize_t actual; - - actual = read_in_full(fd, buf, sz); - if (actual < 0) - die_errno("index-stream: reading input"); - if (write_in_full(fast_import.in, buf, actual) != actual) - die_errno("index-stream: feeding fast-import"); - size -= actual; - } - if (close(fast_import.in)) - die_errno("index-stream: closing fast-import"); - if (finish_command(&fast_import)) - die_errno("index-stream: finishing fast-import"); - - tmpfd = open(tmpfile, O_RDONLY); - if (tmpfd < 0) - die_errno("index-stream: cannot open fast-import mark"); - len = read(tmpfd, buf, sizeof(buf)); - if (len < 0) - die_errno("index-stream: reading fast-import mark"); - if (close(tmpfd) < 0) - die_errno("index-stream: closing fast-import mark"); - if (unlink(tmpfile)) - die_errno("index-stream: unlinking fast-import mark"); - if (len != 44 || - memcmp(":1 ", buf, 3) || - get_sha1_hex(buf + 3, sha1)) - die_errno("index-stream: unexpected fast-import mark: <%s>", buf); - return 0; + return index_bulk_checkin(sha1, fd, size, type, path, flags); } int index_fd(unsigned char *sha1, int fd, struct stat *st, diff --git a/t/t1050-large.sh b/t/t1050-large.sh index deba111bd7..29d6024b7f 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -7,21 +7,97 @@ test_description='adding and checking out large blobs' test_expect_success setup ' git config core.bigfilethreshold 200k && - echo X | dd of=large bs=1k seek=2000 + echo X | dd of=large1 bs=1k seek=2000 && + echo X | dd of=large2 bs=1k seek=2000 && + echo X | dd of=large3 bs=1k seek=2000 && + echo Y | dd of=huge bs=1k seek=2500 ' -test_expect_success 'add a large file' ' - git add large && - # make sure we got a packfile and no loose objects - test -f .git/objects/pack/pack-*.pack && - test ! -f .git/objects/??/?????????????????????????????????????? 
+test_expect_success 'add a large file or two' ' + git add large1 huge large2 && + # make sure we got a single packfile and no loose objects + bad= count=0 idx= && + for p in .git/objects/pack/pack-*.pack + do + count=$(( $count + 1 )) + if test -f "$p" && idx=${p%.pack}.idx && test -f "$idx" + then + continue + fi + bad=t + done && + test -z "$bad" && + test $count = 1 && + cnt=$(git show-index <"$idx" | wc -l) && + test $cnt = 2 && + for l in .git/objects/??/?????????????????????????????????????? + do + test -f "$l" || continue + bad=t + done && + test -z "$bad" && + + # attempt to add another copy of the same + git add large3 && + bad= count=0 && + for p in .git/objects/pack/pack-*.pack + do + count=$(( $count + 1 )) + if test -f "$p" && idx=${p%.pack}.idx && test -f "$idx" + then + continue + fi + bad=t + done && + test -z "$bad" && + test $count = 1 ' test_expect_success 'checkout a large file' ' - large=$(git rev-parse :large) && - git update-index --add --cacheinfo 100644 $large another && + large1=$(git rev-parse :large1) && + git update-index --add --cacheinfo 100644 $large1 another && git checkout another && - cmp large another ;# this must not be test_cmp + cmp large1 another ;# this must not be test_cmp +' + +test_expect_success 'packsize limit' ' + test_create_repo mid && + ( + cd mid && + git config core.bigfilethreshold 64k && + git config pack.packsizelimit 256k && + + # mid1 and mid2 will fit within 256k limit but + # appending mid3 will bust the limit and will + # result in a separate packfile. 
+ test-genrandom "a" $(( 66 * 1024 )) >mid1 && + test-genrandom "b" $(( 80 * 1024 )) >mid2 && + test-genrandom "c" $(( 128 * 1024 )) >mid3 && + git add mid1 mid2 mid3 && + + count=0 + for pi in .git/objects/pack/pack-*.idx + do + test -f "$pi" && count=$(( $count + 1 )) + done && + test $count = 2 && + + ( + git hash-object --stdin <mid1 + git hash-object --stdin <mid2 + git hash-object --stdin <mid3 + ) | + sort >expect && + + for pi in .git/objects/pack/pack-*.idx + do + git show-index <"$pi" + done | + sed -e "s/^[0-9]* \([0-9a-f]*\) .*/\1/" | + sort >actual && + + test_cmp expect actual + ) ' test_done diff --git a/zlib.c b/zlib.c index 3c63d480c7..2b2c0c780e 100644 --- a/zlib.c +++ b/zlib.c @@ -188,13 +188,20 @@ void git_deflate_init_gzip(git_zstream *strm, int level) strm->z.msg ? strm->z.msg : "no message"); } -void git_deflate_end(git_zstream *strm) +int git_deflate_abort(git_zstream *strm) { int status; zlib_pre_call(strm); status = deflateEnd(&strm->z); zlib_post_call(strm); + return status; +} + +void git_deflate_end(git_zstream *strm) +{ + int status = git_deflate_abort(strm); + if (status == Z_OK) return; error("deflateEnd: %s (%s)", zerr_to_string(status), -- cgit v1.2.3