Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.kernel.org/pub/scm/git/git.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/config/core.txt33
-rw-r--r--builtin/unpack-objects.c106
-rw-r--r--object-file.c233
-rw-r--r--object-store.h8
-rwxr-xr-xt/t5351-unpack-large-objects.sh76
5 files changed, 405 insertions, 51 deletions
diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 41e330f306..3ea3124f7f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
Common unit suffixes of 'k', 'm', or 'g' are supported.
core.bigFileThreshold::
- Files larger than this size are stored deflated, without
- attempting delta compression. Storing large files without
- delta compression avoids excessive memory usage, at the
- slight expense of increased disk usage. Additionally files
- larger than this size are always treated as binary.
+ The size of files considered "big", which as discussed below
+ changes the behavior of numerous git commands, as well as how
+ such files are stored within the repository. The default is
+ 512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+ supported.
+
-Default is 512 MiB on all platforms. This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
+
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+ linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+ linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
core.excludesFile::
Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 56d05e2725..43789b8ef2 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,27 @@ static void use(int bytes)
display_throughput(progress, consumed_bytes);
}
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of specified size and return the buffer.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
static void *get_data(unsigned long size)
{
git_zstream stream;
- void *buf = xmallocz(size);
+ unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+ void *buf = xmallocz(bufsize);
memset(&stream, 0, sizeof(stream));
stream.next_out = buf;
- stream.avail_out = size;
+ stream.avail_out = bufsize;
stream.next_in = fill(1);
stream.avail_in = len;
git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
}
stream.next_in = fill(1);
stream.avail_in = len;
+ if (dry_run) {
+ /* reuse the buffer in dry_run mode */
+ stream.next_out = buf;
+ stream.avail_out = bufsize > size - stream.total_out ?
+ size - stream.total_out :
+ bufsize;
+ }
}
git_inflate_end(&stream);
+ if (dry_run)
+ FREE_AND_NULL(buf);
return buf;
}
@@ -326,10 +347,70 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
{
void *buf = get_data(size);
- if (!dry_run && buf)
+ if (buf)
write_object(nr, type, buf, size);
- else
- free(buf);
+}
+
+struct input_zstream_data {
+ git_zstream *zstream;
+ unsigned char buf[8192];
+ int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+ unsigned long *readlen)
+{
+ struct input_zstream_data *data = in_stream->data;
+ git_zstream *zstream = data->zstream;
+ void *in = fill(1);
+
+ if (in_stream->is_finished) {
+ *readlen = 0;
+ return NULL;
+ }
+
+ zstream->next_out = data->buf;
+ zstream->avail_out = sizeof(data->buf);
+ zstream->next_in = in;
+ zstream->avail_in = len;
+
+ data->status = git_inflate(zstream, 0);
+
+ in_stream->is_finished = data->status != Z_OK;
+ use(len - zstream->avail_in);
+ *readlen = sizeof(data->buf) - zstream->avail_out;
+
+ return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+ git_zstream zstream = { 0 };
+ struct input_zstream_data data = { 0 };
+ struct input_stream in_stream = {
+ .read = feed_input_zstream,
+ .data = &data,
+ };
+ struct obj_info *info = &obj_list[nr];
+
+ data.zstream = &zstream;
+ git_inflate_init(&zstream);
+
+ if (stream_loose_object(&in_stream, size, &info->oid))
+ die(_("failed to write object in stream"));
+
+ if (data.status != Z_STREAM_END)
+ die(_("inflate returned (%d)"), data.status);
+ git_inflate_end(&zstream);
+
+ if (strict) {
+ struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+ if (!blob)
+ die(_("invalid blob object from stream"));
+ blob->object.flags |= FLAG_WRITTEN;
+ }
+ info->obj = NULL;
}
static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +440,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
oidread(&base_oid, fill(the_hash_algo->rawsz));
use(the_hash_algo->rawsz);
delta_data = get_data(delta_size);
- if (dry_run || !delta_data) {
- free(delta_data);
+ if (!delta_data)
return;
- }
if (has_object_file(&base_oid))
; /* Ok we have this one */
else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +477,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
die("offset value out of bound for delta base object");
delta_data = get_data(delta_size);
- if (dry_run || !delta_data) {
- free(delta_data);
+ if (!delta_data)
return;
- }
lo = 0;
hi = nr;
while (lo < hi) {
@@ -468,9 +545,14 @@ static void unpack_one(unsigned nr)
}
switch (type) {
+ case OBJ_BLOB:
+ if (!dry_run && size > big_file_threshold) {
+ stream_blob(size, nr);
+ return;
+ }
+ /* fallthrough */
case OBJ_COMMIT:
case OBJ_TREE:
- case OBJ_BLOB:
case OBJ_TAG:
unpack_non_delta_entry(type, size, nr);
return;
diff --git a/object-file.c b/object-file.c
index 6c8e3b1660..5b270f046d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,96 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
return fd;
}
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+ const char *filename, unsigned flags,
+ git_zstream *stream,
+ unsigned char *buf, size_t buflen,
+ git_hash_ctx *c,
+ char *hdr, int hdrlen)
+{
+ int fd;
+
+ fd = create_tmpfile(tmp_file, filename);
+ if (fd < 0) {
+ if (flags & HASH_SILENT)
+ return -1;
+ else if (errno == EACCES)
+ return error(_("insufficient permission for adding "
+ "an object to repository database %s"),
+ get_object_directory());
+ else
+ return error_errno(
+ _("unable to create temporary file"));
+ }
+
+ /* Setup zlib stream for compression */
+ git_deflate_init(stream, zlib_compression_level);
+ stream->next_out = buf;
+ stream->avail_out = buflen;
+ the_hash_algo->init_fn(c);
+
+ /* Start to feed header to zlib stream */
+ stream->next_in = (unsigned char *)hdr;
+ stream->avail_in = hdrlen;
+ while (git_deflate(stream, 0) == Z_OK)
+ ; /* nothing */
+ the_hash_algo->update_fn(c, hdr, hdrlen);
+
+ return fd;
+}
+
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+ git_zstream *stream, const int flush,
+ unsigned char *in0, const int fd,
+ unsigned char *compressed,
+ const size_t compressed_len)
+{
+ int ret;
+
+ ret = git_deflate(stream, flush ? Z_FINISH : 0);
+ the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+ if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+ die(_("unable to write loose object file"));
+ stream->next_out = compressed;
+ stream->avail_out = compressed_len;
+
+ return ret;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+ struct object_id *oid)
+{
+ int ret;
+
+ ret = git_deflate_end_gently(stream);
+ if (ret != Z_OK)
+ return ret;
+ the_hash_algo->final_oid_fn(oid, c);
+
+ return Z_OK;
+}
+
static int write_loose_object(const struct object_id *oid, char *hdr,
int hdrlen, const void *buf, unsigned long len,
time_t mtime, unsigned flags)
@@ -1968,50 +2058,29 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
loose_object_path(the_repository, &filename, oid);
- fd = create_tmpfile(&tmp_file, filename.buf);
- if (fd < 0) {
- if (flags & HASH_SILENT)
- return -1;
- else if (errno == EACCES)
- return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
- else
- return error_errno(_("unable to create temporary file"));
- }
-
- /* Set it up */
- git_deflate_init(&stream, zlib_compression_level);
- stream.next_out = compressed;
- stream.avail_out = sizeof(compressed);
- the_hash_algo->init_fn(&c);
-
- /* First header.. */
- stream.next_in = (unsigned char *)hdr;
- stream.avail_in = hdrlen;
- while (git_deflate(&stream, 0) == Z_OK)
- ; /* nothing */
- the_hash_algo->update_fn(&c, hdr, hdrlen);
+ fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+ &stream, compressed, sizeof(compressed),
+ &c, hdr, hdrlen);
+ if (fd < 0)
+ return -1;
/* Then the data itself.. */
stream.next_in = (void *)buf;
stream.avail_in = len;
do {
unsigned char *in0 = stream.next_in;
- ret = git_deflate(&stream, Z_FINISH);
- the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
- if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
- die(_("unable to write loose object file"));
- stream.next_out = compressed;
- stream.avail_out = sizeof(compressed);
+
+ ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+ compressed, sizeof(compressed));
} while (ret == Z_OK);
if (ret != Z_STREAM_END)
die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
ret);
- ret = git_deflate_end_gently(&stream);
+ ret = end_loose_object_common(&c, &stream, &parano_oid);
if (ret != Z_OK)
die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
ret);
- the_hash_algo->final_oid_fn(&parano_oid, &c);
if (!oideq(oid, &parano_oid))
die(_("confused by unstable object source data for %s"),
oid_to_hex(oid));
@@ -2050,6 +2119,110 @@ static int freshen_packed_object(const struct object_id *oid)
return 1;
}
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+ struct object_id *oid)
+{
+ int fd, ret, err = 0, flush = 0;
+ unsigned char compressed[4096];
+ git_zstream stream;
+ git_hash_ctx c;
+ struct strbuf tmp_file = STRBUF_INIT;
+ struct strbuf filename = STRBUF_INIT;
+ int dirlen;
+ char hdr[MAX_HEADER_LEN];
+ int hdrlen;
+
+ if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+ prepare_loose_object_bulk_checkin();
+
+ /* Since oid is not determined, save tmp file to odb path. */
+ strbuf_addf(&filename, "%s/", get_object_directory());
+ hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+ /*
+ * Common steps for write_loose_object and stream_loose_object to
+ * start writing loose objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ */
+ fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+ &stream, compressed, sizeof(compressed),
+ &c, hdr, hdrlen);
+ if (fd < 0) {
+ err = -1;
+ goto cleanup;
+ }
+
+ /* Then the data itself.. */
+ do {
+ unsigned char *in0 = stream.next_in;
+
+ if (!stream.avail_in && !in_stream->is_finished) {
+ const void *in = in_stream->read(in_stream, &stream.avail_in);
+ stream.next_in = (void *)in;
+ in0 = (unsigned char *)in;
+ /* All data has been read. */
+ if (in_stream->is_finished)
+ flush = 1;
+ }
+ ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+ compressed, sizeof(compressed));
+ /*
+ * Unlike write_loose_object(), we do not have the entire
+ * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+ * then we'll replenish them in the next input_stream->read()
+ * call when we loop.
+ */
+ } while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+ if (stream.total_in != len + hdrlen)
+ die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+ (uintmax_t)len + hdrlen);
+
+ /*
+ * Common steps for write_loose_object and stream_loose_object to
+ * end writing loose oject:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid.
+ */
+ if (ret != Z_STREAM_END)
+ die(_("unable to stream deflate new object (%d)"), ret);
+ ret = end_loose_object_common(&c, &stream, oid);
+ if (ret != Z_OK)
+ die(_("deflateEnd on stream object failed (%d)"), ret);
+ close_loose_object(fd, tmp_file.buf);
+
+ if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+ unlink_or_warn(tmp_file.buf);
+ goto cleanup;
+ }
+
+ loose_object_path(the_repository, &filename, oid);
+
+ /* We finally know the object path, and create the missing dir. */
+ dirlen = directory_size(filename.buf);
+ if (dirlen) {
+ struct strbuf dir = STRBUF_INIT;
+ strbuf_add(&dir, filename.buf, dirlen);
+
+ if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+ err = error_errno(_("unable to create directory %s"), dir.buf);
+ strbuf_release(&dir);
+ goto cleanup;
+ }
+ strbuf_release(&dir);
+ }
+
+ err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+ strbuf_release(&tmp_file);
+ strbuf_release(&filename);
+ return err;
+}
+
int write_object_file_flags(const void *buf, unsigned long len,
enum object_type type, struct object_id *oid,
unsigned flags)
diff --git a/object-store.h b/object-store.h
index 539ea43904..5222ee5460 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
char *path;
};
+struct input_stream {
+ const void *(*read)(struct input_stream *, unsigned long *len);
+ void *data;
+ int is_finished;
+};
+
KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
struct object_directory *, 1, fspathhash, fspatheq)
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
int write_object_file_literally(const void *buf, unsigned long len,
const char *type, struct object_id *oid,
unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+ struct object_id *oid);
/*
* Add an object file to the in-memory object store, without writing it
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 0000000000..8ce8aa3b14
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+ test_when_finished "rm -rf dest.git" &&
+ git init --bare dest.git &&
+ git -C dest.git config core.bigFileThreshold "$1"
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+ test-tool genrandom foo 1500000 >big-blob &&
+ test_commit --append foo big-blob &&
+ test-tool genrandom bar 1500000 >big-blob &&
+ test_commit --append bar big-blob &&
+ PACK=$(echo HEAD | git pack-objects --revs pack) &&
+ git verify-pack -v pack-$PACK.pack >out &&
+ sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
+ <out >obj-list
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+ GIT_ALLOC_LIMIT=1m &&
+ export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+ prepare_dest 2m &&
+ test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+ grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+ prepare_dest 2m &&
+ git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+ test_stdout_line_count = 0 find dest.git/objects -type f &&
+ test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'unpack big object in stream' '
+ prepare_dest 1m &&
+ git -C dest.git unpack-objects <pack-$PACK.pack &&
+ test_dir_is_empty dest.git/objects/pack
+'
+
+BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
+
+test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
+ prepare_dest 1m &&
+ GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
+ git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
+ grep fsync/hardware-flush trace2.txt &&
+ test_dir_is_empty dest.git/objects/pack &&
+ git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
+ cmp obj-list current
+'
+
+test_expect_success 'do not unpack existing large objects' '
+ prepare_dest 1m &&
+ git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+ git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+ # The destination came up with the exact same pack...
+ DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+ test_cmp pack-$PACK.pack $DEST_PACK &&
+
+ # ...and wrote no loose objects
+ test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
+test_done