Merge branch 'mc/clean-smudge-with-llp64'

The clean/smudge conversion code path has been prepared to better work on platforms where ulong is narrower than size_t. * mc/clean-smudge-with-llp64: clean/smudge: allow clean filters to process extremely large files odb: guard against data loss checking out a huge file git-compat-util: introduce more size_t helpers odb: teach read_blob_entry to use size_t t1051: introduce a smudge filter test for extremely large files test-lib: add prerequisite for 64-bit platforms test-tool genzeros: generate large amounts of data more efficiently test-genzeros: allow more than 2G zeros in Windows
author: Junio C Hamano <gitster@pobox.com> 2021-11-30 02:41:51 +0300
committer: Junio C Hamano <gitster@pobox.com> 2021-11-30 02:41:51 +0300
commit: f9ba6acaa9348ea7b733bf78adc2f084247a912f (patch)
tree: bd702b925cea098b8e04bd49272c5faba336bf15
parent: ad1260b6c994f7c0f9c259bd39f39979f7f4ecc2 (diff)
parent: 596b5e77c960cc57ad2e68407b298411ec5e8cb8 (diff)
11 files changed, 89 insertions, 19 deletions
diff --git a/convert.c b/convert.c
index 0d6fb3410a..df7186bd81 100644
--- a/convert.c
+++ b/convert.c
@@ -613,7 +613,7 @@ static int crlf_to_worktree(const char *src, size_t len, struct strbuf *buf,
 
 struct filter_params {
 	const char *src;
-	unsigned long size;
+	size_t size;
 	int fd;
 	const char *cmd;
 	const char *path;
diff --git a/delta.h b/delta.h
index 2df5fe13d9..8a56ec0799 100644
--- a/delta.h
+++ b/delta.h
@@ -90,15 +90,15 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
 					       const unsigned char *top)
 {
 	const unsigned char *data = *datap;
-	unsigned long cmd, size = 0;
+	size_t cmd, size = 0;
 	int i = 0;
 	do {
 		cmd = *data++;
-		size |= (cmd & 0x7f) << i;
+		size |= st_left_shift(cmd & 0x7f, i);
 		i += 7;
 	} while (cmd & 0x80 && data < top);
 	*datap = data;
-	return size;
+	return cast_size_t_to_ulong(size);
 }
 
 #endif
diff --git a/entry.c b/entry.c
index 9b0f968a70..1c9df62b30 100644
--- a/entry.c
+++ b/entry.c
@@ -82,11 +82,13 @@ static int create_file(const char *path, unsigned int mode)
 	return open(path, O_WRONLY | O_CREAT | O_EXCL, mode);
 }
 
-void *read_blob_entry(const struct cache_entry *ce, unsigned long *size)
+void *read_blob_entry(const struct cache_entry *ce, size_t *size)
 {
 	enum object_type type;
-	void *blob_data = read_object_file(&ce->oid, &type, size);
+	unsigned long ul;
+	void *blob_data = read_object_file(&ce->oid, &type, &ul);
 
+	*size = ul;
 	if (blob_data) {
 		if (type == OBJ_BLOB)
 			return blob_data;
@@ -271,7 +273,7 @@ static int write_entry(struct cache_entry *ce, char *path, struct conv_attrs *ca
 	int fd, ret, fstat_done = 0;
 	char *new_blob;
 	struct strbuf buf = STRBUF_INIT;
-	unsigned long size;
+	size_t size;
 	ssize_t wrote;
 	size_t newsize = 0;
 	struct stat st;
diff --git a/entry.h b/entry.h
index 2254c62727..252fd24c2e 100644
--- a/entry.h
+++ b/entry.h
@@ -52,7 +52,7 @@ int finish_delayed_checkout(struct checkout *state, int *nr_checkouts,
  */
 void unlink_entry(const struct cache_entry *ce);
 
-void *read_blob_entry(const struct cache_entry *ce, unsigned long *size);
+void *read_blob_entry(const struct cache_entry *ce, size_t *size);
 int fstat_checkout_output(int fd, const struct checkout *state, struct stat *st);
 void update_ce_after_write(const struct checkout *state, struct cache_entry *ce,
 			   struct stat *st);
diff --git a/git-compat-util.h b/git-compat-util.h
index 7176a43381..c6bd2a84e5 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -113,6 +113,14 @@
 #define unsigned_mult_overflows(a, b) \
     ((a) && (b) > maximum_unsigned_value_of_type(a) / (a))
 
+/*
+ * Returns true if the left shift of "a" by "shift" bits will
+ * overflow. The type of "a" must be unsigned.
+ */
+#define unsigned_left_shift_overflows(a, shift) \
+    ((shift) < bitsizeof(a) && \
+     (a) > maximum_unsigned_value_of_type(a) >> (shift))
+
 #ifdef __GNUC__
 #define TYPEOF(x) (__typeof__(x))
 #else
@@ -862,6 +870,23 @@ static inline size_t st_sub(size_t a, size_t b)
 	return a - b;
 }
 
+static inline size_t st_left_shift(size_t a, unsigned shift)
+{
+	if (unsigned_left_shift_overflows(a, shift))
+		die("size_t overflow: %"PRIuMAX" << %u",
+		    (uintmax_t)a, shift);
+	return a << shift;
+}
+
+static inline unsigned long cast_size_t_to_ulong(size_t a)
+{
+	if (a != (unsigned long)a)
+		die("object too large to read on this platform: %"
+		    PRIuMAX" is cut off to %lu",
+		    (uintmax_t)a, (unsigned long)a);
+	return (unsigned long)a;
+}
+
 #ifdef HAVE_ALLOCA_H
 # include <alloca.h>
 # define xalloca(size)      (alloca(size))
diff --git a/object-file.c b/object-file.c
index c3d866a287..eb972cdccd 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1306,7 +1306,7 @@ static void *unpack_loose_rest(git_zstream *stream,
 int parse_loose_header(const char *hdr, struct object_info *oi)
 {
 	const char *type_buf = hdr;
-	unsigned long size;
+	size_t size;
 	int type, type_len = 0;
 
 	/*
@@ -1341,12 +1341,12 @@ int parse_loose_header(const char *hdr, struct object_info *oi)
 			if (c > 9)
 				break;
 			hdr++;
-			size = size * 10 + c;
+			size = st_add(st_mult(size, 10), c);
 		}
 	}
 
 	if (oi->sizep)
-		*oi->sizep = size;
+		*oi->sizep = cast_size_t_to_ulong(size);
 
 	/*
 	 * The length must be followed by a zero byte
diff --git a/packfile.c b/packfile.c
index 89402cfc69..6423d77faa 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1060,7 +1060,7 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
 		unsigned long len, enum object_type *type, unsigned long *sizep)
 {
 	unsigned shift;
-	unsigned long size, c;
+	size_t size, c;
 	unsigned long used = 0;
 
 	c = buf[used++];
@@ -1074,10 +1074,10 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
 			break;
 		}
 		c = buf[used++];
-		size += (c & 0x7f) << shift;
+		size = st_add(size, st_left_shift(c & 0x7f, shift));
 		shift += 7;
 	}
-	*sizep = size;
+	*sizep = cast_size_t_to_ulong(size);
 	return used;
 }
 
diff --git a/parallel-checkout.c b/parallel-checkout.c
index ed9c999520..8dd7e7bad4 100644
--- a/parallel-checkout.c
+++ b/parallel-checkout.c
@@ -261,7 +261,7 @@ static int write_pc_item_to_fd(struct parallel_checkout_item *pc_item, int fd,
 	struct stream_filter *filter;
 	struct strbuf buf = STRBUF_INIT;
 	char *blob;
-	unsigned long size;
+	size_t size;
 	ssize_t wrote;
 
 	/* Sanity check */
diff --git a/t/helper/test-genzeros.c b/t/helper/test-genzeros.c
index 9532f5bac9..8ca988d621 100644
--- a/t/helper/test-genzeros.c
+++ b/t/helper/test-genzeros.c
@@ -3,18 +3,31 @@
 
 int cmd__genzeros(int argc, const char **argv)
 {
-	long count;
+	/* static, so that it is NUL-initialized */
+	static const char zeros[256 * 1024];
+	intmax_t count;
+	ssize_t n;
 
 	if (argc > 2) {
 		fprintf(stderr, "usage: %s [<count>]\n", argv[0]);
 		return 1;
 	}
 
-	count = argc > 1 ? strtol(argv[1], NULL, 0) : -1L;
+	count = argc > 1 ? strtoimax(argv[1], NULL, 0) : -1;
 
-	while (count < 0 || count--) {
-		if (putchar(0) == EOF)
+	/* Writing out individual NUL bytes is slow... */
+	while (count < 0)
+		if (write(1, zeros, ARRAY_SIZE(zeros)) < 0)
 			return -1;
+
+	while (count > 0) {
+		n = write(1, zeros, count < ARRAY_SIZE(zeros) ?
+			  count : ARRAY_SIZE(zeros));
+
+		if (n < 0)
+			return -1;
+
+		count -= n;
 	}
 
 	return 0;
diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh
index 8b7640b3ba..042b0e4429 100755
--- a/t/t1051-large-conversion.sh
+++ b/t/t1051-large-conversion.sh
@@ -83,4 +83,30 @@ test_expect_success 'ident converts on output' '
 	test_cmp small.clean large.clean
 '
 
+# This smudge filter prepends 5GB of zeros to the file it checks out. This
+# ensures that smudging doesn't mangle large files on 64-bit Windows.
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'files over 4GB convert on output' '
+	test_commit test small "a small file" &&
+	small_size=$(test_file_size small) &&
+	test_config filter.makelarge.smudge \
+		"test-tool genzeros $((5*1024*1024*1024)) && cat" &&
+	echo "small filter=makelarge" >.gitattributes &&
+	rm small &&
+	git checkout -- small &&
+	size=$(test_file_size small) &&
+	test "$size" -eq $((5 * 1024 * 1024 * 1024 + $small_size))
+'
+
+# This clean filter writes down the size of input it receives. By checking against
+# the actual size, we ensure that cleaning doesn't mangle large files on 64-bit Windows.
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'files over 4GB convert on input' '
+	test-tool genzeros $((5*1024*1024*1024)) >big &&
+	test_config filter.checklarge.clean "wc -c >big.size" &&
+	echo "big filter=checklarge" >.gitattributes &&
+	git add big &&
+	test $(test_file_size big) -eq $(cat big.size)
+'
+
 test_done
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 2679a7596a..57efcc5e97 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -1734,6 +1734,10 @@ build_option () {
 	sed -ne "s/^$1: //p"
 }
 
+test_lazy_prereq SIZE_T_IS_64BIT '
+	test 8 -eq "$(build_option sizeof-size_t)"
+'
+
 test_lazy_prereq LONG_IS_64BIT '
 	test 8 -le "$(build_option sizeof-long)"
 '
author	Junio C Hamano <gitster@pobox.com>	2021-11-30 02:41:51 +0300
committer	Junio C Hamano <gitster@pobox.com>	2021-11-30 02:41:51 +0300
commit	f9ba6acaa9348ea7b733bf78adc2f084247a912f (patch)
tree	bd702b925cea098b8e04bd49272c5faba336bf15
parent	ad1260b6c994f7c0f9c259bd39f39979f7f4ecc2 (diff)
parent	596b5e77c960cc57ad2e68407b298411ec5e8cb8 (diff)