From 4c2db93807f5ab65976a901b562e4bc8d69d40bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:34:59 +0200 Subject: read-cache.c: make $GIT_TEST_SPLIT_INDEX boolean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While at there, document about this special mode when running the test suite. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- ci/run-build-and-tests.sh | 2 +- read-cache.c | 4 ++-- t/README | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh index 3735ce413f1..8190a753de3 100755 --- a/ci/run-build-and-tests.sh +++ b/ci/run-build-and-tests.sh @@ -11,7 +11,7 @@ make --jobs=2 make --quiet test if test "$jobname" = "linux-gcc" then - GIT_TEST_SPLIT_INDEX=YesPlease make --quiet test + GIT_TEST_SPLIT_INDEX=yes make --quiet test fi check_unignored_build_artifacts diff --git a/read-cache.c b/read-cache.c index 10f1c6bb8a3..fa3df2e72e0 100644 --- a/read-cache.c +++ b/read-cache.c @@ -2268,7 +2268,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, if (!istate->version) { istate->version = get_index_format_default(); - if (getenv("GIT_TEST_SPLIT_INDEX")) + if (git_env_bool("GIT_TEST_SPLIT_INDEX", 0)) init_split_index(istate); } @@ -2559,7 +2559,7 @@ int write_locked_index(struct index_state *istate, struct lock_file *lock, goto out; } - if (getenv("GIT_TEST_SPLIT_INDEX")) { + if (git_env_bool("GIT_TEST_SPLIT_INDEX", 0)) { int v = si->base_sha1[0]; if ((v & 15) < 6) istate->cache_changed |= SPLIT_INDEX_ORDERED; diff --git a/t/README b/t/README index 24ddebfabf9..d5e0a3c786d 100644 --- a/t/README +++ b/t/README @@ -293,6 +293,17 @@ and know what setup is needed for it. Or when you want to run everything up to a certain test. +Running tests with special setups +--------------------------------- + +The whole test suite could be run to test some special features +that cannot be easily covered by a few specific test cases. These +could be enabled by running the test suite with correct GIT_TEST_ +environment set. + +GIT_TEST_SPLIT_INDEX= forces split-index mode on the whole +test suite. Accept any boolean values that are accepted by git-config. + Naming Tests ------------ -- cgit v1.2.3 From 8d6ccce14fd1a5843f5c9b231d4dcb81f6ceeb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:00 +0200 Subject: pack-objects: a bit of document about struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The role of this comment block becomes more important after we shuffle fields around to shrink this struct. It will be much harder to see what field is related to what. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pack-objects.h b/pack-objects.h index 03f1191659d..c0a1f61aac8 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,6 +1,51 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +/* + * basic object info + * ----------------- + * idx.oid is filled up before delta searching starts. idx.crc32 is + * only valid after the object is written out and will be used for + * generating the index. idx.offset will be both gradually set and + * used in writing phase (base objects get offset first, then deltas + * refer to them) + * + * "size" is the uncompressed object size. Compressed size of the raw + * data for an object in a pack is not stored anywhere but is computed + * and made available when reverse .idx is made. + * + * "hash" contains a path name hash which is used for sorting the + * delta list and also during delta searching. Once prepare_pack() + * returns it's no longer needed. + * + * source pack info + * ---------------- + * The (in_pack, in_pack_offset) tuple contains the location of the + * object in the source pack. in_pack_header_size allows quickly + * skipping the header and going straight to the zlib stream. + * + * "type" and "in_pack_type" both describe object type. in_pack_type + * may contain a delta type, while type is always the canonical type. + * + * deltas + * ------ + * Delta links (delta, delta_child and delta_sibling) are created to + * reflect that delta graph from the source pack then updated or added + * during delta searching phase when we find better deltas. + * + * delta_child and delta_sibling are last needed in + * compute_write_order(). "delta" and "delta_size" must remain valid + * at object writing phase in case the delta is not cached. + * + * If a delta is cached in memory and is compressed, delta_data points + * to the data and z_delta_size contains the compressed size. If it's + * uncompressed [1], z_delta_size must be zero. delta_size is always + * the uncompressed size and must be valid even if the delta is not + * cached. + * + * [1] during try_delta phase we don't bother with compressing because + * the delta could be quickly replaced with a better one. + */ struct object_entry { struct pack_idx_entry idx; unsigned long size; /* uncompressed size */ -- cgit v1.2.3 From fd9b1baef8a940c9c251995b006a3d96f210e639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:01 +0200 Subject: pack-objects: turn type and in_pack_type to bitfields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An extra field type_valid is added to carry the equivalent of OBJ_BAD in the original "type" field. in_pack_type always contains a valid type so we only need 3 bits for it. A note about accepting OBJ_NONE as "valid" type. The function read_object_list_from_stdin() can pass this value [1] and it eventually calls create_object_entry() where current code skip setting "type" field if the incoming type is zero. This does not have any bad side effects because "type" field should be memset()'d anyway. But since we also need to set type_valid now, skipping oe_set_type() leaves type_valid zero/false, which will make oe_type() return OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger fatal: unable to get type of object ... Accepting OBJ_NONE [2] does sound wrong, but this is how it is has been for a very long time and I haven't time to dig in further. [1] See 5c49c11686 (pack-objects: better check_object() performances - 2007-04-16) [2] 21666f1aae (convert object type handling from a string to a number - 2007-02-26) Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 59 ++++++++++++++++++++++++++++++-------------------- cache.h | 2 ++ object.h | 1 - pack-bitmap-write.c | 6 ++--- pack-objects.h | 20 +++++++++++++++-- 5 files changed, 58 insertions(+), 30 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4bdae5a1d8f..88877f1f59d 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -266,7 +266,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent struct git_istream *st = NULL; if (!usable_delta) { - if (entry->type == OBJ_BLOB && + if (oe_type(entry) == OBJ_BLOB && entry->size > big_file_threshold && (st = open_istream(&entry->idx.oid, &type, &size, NULL)) != NULL) buf = NULL; @@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, struct pack_window *w_curs = NULL; struct revindex_entry *revidx; off_t offset; - enum object_type type = entry->type; + enum object_type type = oe_type(entry); off_t datalen; unsigned char header[MAX_PACK_OBJECT_HEADER], dheader[MAX_PACK_OBJECT_HEADER]; @@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f, to_reuse = 0; /* explicit */ else if (!entry->in_pack) to_reuse = 0; /* can't reuse what we don't have */ - else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA) + else if (oe_type(entry) == OBJ_REF_DELTA || + oe_type(entry) == OBJ_OFS_DELTA) /* check_object() decided it for us ... */ to_reuse = usable_delta; /* ... but pack split may override that */ - else if (entry->type != entry->in_pack_type) + else if (oe_type(entry) != entry->in_pack_type) to_reuse = 0; /* pack has delta which is unusable */ else if (entry->delta) to_reuse = 0; /* we want to pack afresh */ @@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void) * And then all remaining commits and tags. */ for (i = last_untagged; i < to_pack.nr_objects; i++) { - if (objects[i].type != OBJ_COMMIT && - objects[i].type != OBJ_TAG) + if (oe_type(&objects[i]) != OBJ_COMMIT && + oe_type(&objects[i]) != OBJ_TAG) continue; add_to_write_order(wo, &wo_end, &objects[i]); } @@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void) * And then all the trees. */ for (i = last_untagged; i < to_pack.nr_objects; i++) { - if (objects[i].type != OBJ_TREE) + if (oe_type(&objects[i]) != OBJ_TREE) continue; add_to_write_order(wo, &wo_end, &objects[i]); } @@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid, entry = packlist_alloc(&to_pack, oid->hash, index_pos); entry->hash = hash; - if (type) - entry->type = type; + oe_set_type(entry, type); if (exclude) entry->preferred_base = 1; else @@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry) unsigned long avail; off_t ofs; unsigned char *buf, c; + enum object_type type; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); @@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry) * since non-delta representations could still be reused. */ used = unpack_object_header_buffer(buf, avail, - &entry->in_pack_type, + &type, &entry->size); if (used == 0) goto give_up; + if (type < 0) + BUG("invalid type %d", type); + entry->in_pack_type = type; + /* * Determine if this is a delta and if so whether we can * reuse it or not. Otherwise let's find out as cheaply as @@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry) switch (entry->in_pack_type) { default: /* Not a delta hence we've already got all we need. */ - entry->type = entry->in_pack_type; + oe_set_type(entry, entry->in_pack_type); entry->in_pack_header_size = used; - if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB) + if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB) goto give_up; unuse_pack(&w_curs); return; @@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry) * deltify other objects against, in order to avoid * circular deltas. */ - entry->type = entry->in_pack_type; + oe_set_type(entry, entry->in_pack_type); entry->delta = base_entry; entry->delta_size = entry->size; entry->delta_sibling = base_entry->delta_child; @@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry) return; } - if (entry->type) { + if (oe_type(entry)) { /* * This must be a delta and we already know what the * final object type is. Let's extract the actual @@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry) unuse_pack(&w_curs); } - entry->type = oid_object_info(&entry->idx.oid, &entry->size); + oe_set_type(entry, oid_object_info(&entry->idx.oid, &entry->size)); /* * The error condition is checked in prepare_pack(). This is * to permit a missing preferred base object to be ignored @@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry) { struct object_entry **p = &entry->delta->delta_child; struct object_info oi = OBJECT_INFO_INIT; + enum object_type type; while (*p) { if (*p == entry) @@ -1570,15 +1576,18 @@ static void drop_reused_delta(struct object_entry *entry) entry->depth = 0; oi.sizep = &entry->size; - oi.typep = &entry->type; + oi.typep = &type; if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) { /* * We failed to get the info from this pack for some reason; * fall back to sha1_object_info, which may find another copy. - * And if that fails, the error will be recorded in entry->type + * And if that fails, the error will be recorded in oe_type(entry) * and dealt with in prepare_pack(). */ - entry->type = oid_object_info(&entry->idx.oid, &entry->size); + oe_set_type(entry, oid_object_info(&entry->idx.oid, + &entry->size)); + } else { + oe_set_type(entry, type); } } @@ -1746,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b) { const struct object_entry *a = *(struct object_entry **)_a; const struct object_entry *b = *(struct object_entry **)_b; + enum object_type a_type = oe_type(a); + enum object_type b_type = oe_type(b); - if (a->type > b->type) + if (a_type > b_type) return -1; - if (a->type < b->type) + if (a_type < b_type) return 1; if (a->hash > b->hash) return -1; @@ -1825,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, void *delta_buf; /* Don't bother doing diffs between different types */ - if (trg_entry->type != src_entry->type) + if (oe_type(trg_entry) != oe_type(src_entry)) return -1; /* @@ -2429,11 +2440,11 @@ static void prepare_pack(int window, int depth) if (!entry->preferred_base) { nr_deltas++; - if (entry->type < 0) + if (oe_type(entry) < 0) die("unable to get type of object %s", oid_to_hex(&entry->idx.oid)); } else { - if (entry->type < 0) { + if (oe_type(entry) < 0) { /* * This object is not found, but we * don't have to include it anyway. @@ -2542,7 +2553,7 @@ static void read_object_list_from_stdin(void) die("expected object ID, got garbage:\n %s", line); add_preferred_base_object(p + 1); - add_object_entry(&oid, 0, p + 1, 0); + add_object_entry(&oid, OBJ_NONE, p + 1, 0); } } diff --git a/cache.h b/cache.h index bbaf5c349ab..543bb42ce88 100644 --- a/cache.h +++ b/cache.h @@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate); #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz)) #endif +#define TYPE_BITS 3 + enum object_type { OBJ_BAD = -1, OBJ_NONE = 0, diff --git a/object.h b/object.h index f13f85b2a94..3d305fe868f 100644 --- a/object.h +++ b/object.h @@ -25,7 +25,6 @@ struct object_array { #define OBJECT_ARRAY_INIT { 0, 0, NULL } -#define TYPE_BITS 3 /* * object flag allocation: * revision.h: 0---------10 26 diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 41ae27fb19a..2df7b3e1449 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index, entry->in_pack_pos = i; - switch (entry->type) { + switch (oe_type(entry)) { case OBJ_COMMIT: case OBJ_TREE: case OBJ_BLOB: case OBJ_TAG: - real_type = entry->type; + real_type = oe_type(entry); break; default: @@ -97,7 +97,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index, default: die("Missing type information for %s (%d/%d)", oid_to_hex(&entry->idx.oid), real_type, - entry->type); + oe_type(entry)); } } } diff --git a/pack-objects.h b/pack-objects.h index c0a1f61aac8..b4a83a61239 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -59,8 +59,9 @@ struct object_entry { void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ unsigned long z_delta_size; /* delta data size (compressed) */ - enum object_type type; - enum object_type in_pack_type; /* could be delta */ + unsigned type_:TYPE_BITS; + unsigned in_pack_type:TYPE_BITS; /* could be delta */ + unsigned type_valid:1; uint32_t hash; /* name hint hash */ unsigned int in_pack_pos; unsigned char in_pack_header_size; @@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name) return hash; } +static inline enum object_type oe_type(const struct object_entry *e) +{ + return e->type_valid ? e->type_ : OBJ_BAD; +} + +static inline void oe_set_type(struct object_entry *e, + enum object_type type) +{ + if (type >= OBJ_ANY) + BUG("OBJ_ANY cannot be set in pack-objects code"); + + e->type_valid = type >= OBJ_NONE; + e->type_ = (unsigned)type; +} + #endif -- cgit v1.2.3 From 0c6804ab4ee5cfa47fe28e0a2d20415c5c1f8884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:02 +0200 Subject: pack-objects: use bitfield for object_entry::dfs_state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 3 +++ pack-objects.h | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 88877f1f59d..cc3c31747e0 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) OPT_END(), }; + if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS)) + BUG("too many dfs states, increase OE_DFS_STATE_BITS"); + check_replace_refs = 0; reset_pack_idx_option(&pack_idx_opts); diff --git a/pack-objects.h b/pack-objects.h index b4a83a61239..080ef62d317 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,6 +1,21 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +#define OE_DFS_STATE_BITS 2 + +/* + * State flags for depth-first search used for analyzing delta cycles. + * + * The depth is measured in delta-links to the base (so if A is a delta + * against B, then A has a depth of 1, and B a depth of 0). + */ +enum dfs_state { + DFS_NONE = 0, + DFS_ACTIVE, + DFS_DONE, + DFS_NUM_STATES +}; + /* * basic object info * ----------------- @@ -73,19 +88,10 @@ struct object_entry { unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ + unsigned dfs_state:OE_DFS_STATE_BITS; - /* - * State flags for depth-first search used for analyzing delta cycles. - * - * The depth is measured in delta-links to the base (so if A is a delta - * against B, then A has a depth of 1, and B a depth of 0). - */ - enum { - DFS_NONE = 0, - DFS_ACTIVE, - DFS_DONE - } dfs_state; int depth; + }; struct packing_data { -- cgit v1.2.3 From b5c0cbd8083f71e071207fca0d5434c6db6ff6c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:03 +0200 Subject: pack-objects: use bitfield for object_entry::depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because of struct packing from now on we can only handle max depth 4095 (or even lower when new booleans are added in this struct). This should be ok since long delta chain will cause significant slow down anyway. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/config.txt | 1 + Documentation/git-pack-objects.txt | 4 +++- Documentation/git-repack.txt | 4 +++- builtin/pack-objects.c | 6 ++++++ pack-objects.h | 5 ++--- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 2659153cb37..d97f10722c1 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -2422,6 +2422,7 @@ pack.window:: pack.depth:: The maximum delta depth used by linkgit:git-pack-objects[1] when no maximum depth is given on the command line. Defaults to 50. + Maximum value is 4095. pack.windowMemory:: The maximum size of memory that is consumed by each thread diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 81bc490ac52..3503c9e3e63 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -96,7 +96,9 @@ base-name:: it too deep affects the performance on the unpacker side, because delta data needs to be applied that many times to get to the necessary object. - The default value for --window is 10 and --depth is 50. ++ +The default value for --window is 10 and --depth is 50. The maximum +depth is 4095. --window-memory=:: This option provides an additional limit on top of `--window`; diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt index ae750e9e114..25c83c4927e 100644 --- a/Documentation/git-repack.txt +++ b/Documentation/git-repack.txt @@ -90,7 +90,9 @@ other objects in that pack they already have locally. space. `--depth` limits the maximum delta depth; making it too deep affects the performance on the unpacker side, because delta data needs to be applied that many times to get to the necessary object. - The default value for --window is 10 and --depth is 50. ++ +The default value for --window is 10 and --depth is 50. The maximum +depth is 4095. --threads=:: This option is passed through to `git pack-objects`. diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index cc3c31747e0..b231e80f17d 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3068,6 +3068,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (pack_to_stdout != !base_name || argc) usage_with_options(pack_usage, pack_objects_options); + if (depth >= (1 << OE_DEPTH_BITS)) { + warning(_("delta chain depth %d is too deep, forcing %d"), + depth, (1 << OE_DEPTH_BITS) - 1); + depth = (1 << OE_DEPTH_BITS) - 1; + } + argv_array_push(&rp, "pack-objects"); if (thin) { use_internal_rev_list = 1; diff --git a/pack-objects.h b/pack-objects.h index 080ef62d317..cdce1648de9 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -2,6 +2,7 @@ #define PACK_OBJECTS_H #define OE_DFS_STATE_BITS 2 +#define OE_DEPTH_BITS 12 /* * State flags for depth-first search used for analyzing delta cycles. @@ -89,9 +90,7 @@ struct object_entry { unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ unsigned dfs_state:OE_DFS_STATE_BITS; - - int depth; - + unsigned depth:OE_DEPTH_BITS; }; struct packing_data { -- cgit v1.2.3 From 06af3bba414b832fe9e04fb959daa2b9b678d2d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:04 +0200 Subject: pack-objects: move in_pack_pos out of struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This field is only need for pack-bitmap, which is an optional feature. Move it to a separate array that is only allocated when pack-bitmap is used (like objects[], it is not freed, since we need it until the end of the process) Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 3 ++- pack-bitmap-write.c | 8 +++++--- pack-bitmap.c | 2 +- pack-bitmap.h | 4 +++- pack-objects.h | 16 +++++++++++++++- 5 files changed, 26 insertions(+), 7 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b231e80f17d..d9c89e87cd2 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -879,7 +879,8 @@ static void write_pack_file(void) if (write_bitmap_index) { bitmap_writer_set_checksum(oid.hash); - bitmap_writer_build_type_index(written_list, nr_written); + bitmap_writer_build_type_index( + &to_pack, written_list, nr_written); } finish_tmp_packfile(&tmpname, pack_tmp_name, diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 2df7b3e1449..d707fc9ea2f 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show) /** * Build the initial type index for the packfile */ -void bitmap_writer_build_type_index(struct pack_idx_entry **index, +void bitmap_writer_build_type_index(struct packing_data *to_pack, + struct pack_idx_entry **index, uint32_t index_nr) { uint32_t i; @@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index, writer.trees = ewah_new(); writer.blobs = ewah_new(); writer.tags = ewah_new(); + ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects); for (i = 0; i < index_nr; ++i) { struct object_entry *entry = (struct object_entry *)index[i]; enum object_type real_type; - entry->in_pack_pos = i; + oe_set_in_pack_pos(to_pack, entry, i); switch (oe_type(entry)) { case OBJ_COMMIT: @@ -146,7 +148,7 @@ static uint32_t find_object_pos(const unsigned char *sha1) "(object %s is missing)", sha1_to_hex(sha1)); } - return entry->in_pack_pos; + return oe_in_pack_pos(writer.to_pack, entry); } static void show_object(struct object *object, const char *name, void *data) diff --git a/pack-bitmap.c b/pack-bitmap.c index 3f2dab340f6..c9e90d1bb53 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -1033,7 +1033,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, oe = packlist_find(mapping, sha1, NULL); if (oe) - reposition[i] = oe->in_pack_pos + 1; + reposition[i] = oe_in_pack_pos(mapping, oe) + 1; } rebuild = bitmap_new(); diff --git a/pack-bitmap.h b/pack-bitmap.h index 3742a00e14a..5ded2f139a6 100644 --- a/pack-bitmap.h +++ b/pack-bitmap.h @@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi void bitmap_writer_show_progress(int show); void bitmap_writer_set_checksum(unsigned char *sha1); -void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr); +void bitmap_writer_build_type_index(struct packing_data *to_pack, + struct pack_idx_entry **index, + uint32_t index_nr); void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack); void bitmap_writer_select_commits(struct commit **indexed_commits, unsigned int indexed_commits_nr, int max_bitmaps); diff --git a/pack-objects.h b/pack-objects.h index cdce1648de9..71ea992c3cd 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -79,7 +79,6 @@ struct object_entry { unsigned in_pack_type:TYPE_BITS; /* could be delta */ unsigned type_valid:1; uint32_t hash; /* name hint hash */ - unsigned int in_pack_pos; unsigned char in_pack_header_size; unsigned preferred_base:1; /* * we do not pack this, but is available @@ -99,6 +98,8 @@ struct packing_data { int32_t *index; uint32_t index_size; + + unsigned int *in_pack_pos; }; struct object_entry *packlist_alloc(struct packing_data *pdata, @@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e, e->type_ = (unsigned)type; } +static inline unsigned int oe_in_pack_pos(const struct packing_data *pack, + const struct object_entry *e) +{ + return pack->in_pack_pos[e - pack->objects]; +} + +static inline void oe_set_in_pack_pos(const struct packing_data *pack, + const struct object_entry *e, + unsigned int pos) +{ + pack->in_pack_pos[e - pack->objects] = pos; +} + #endif -- cgit v1.2.3 From 43fa44fa3b68e6570145126892e1e43380d7bb5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:05 +0200 Subject: pack-objects: move in_pack out of struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using 8 bytes (on 64 bit arch) to store a pointer to a pack. Use an index instead since the number of packs should be relatively small. This limits the number of packs we can handle to 1k. Since we can't be sure people can never run into the situation where they have more than 1k pack files. Provide a fall back route for it. If we find out they have too many packs, the new in_pack_by_idx[] array (which has at most 1k elements) will not be used. Instead we allocate in_pack[] array that holds nr_objects elements. This is similar to how the optional in_pack_pos field is handled. The new simple test is just to make sure the too-many-packs code path is at least executed. The true test is running make test GIT_TEST_FULL_IN_PACK_ARRAY=1 to take advantage of other special case tests. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 28 +++++++++++++--------- object-store.h | 1 + pack-objects.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ pack-objects.h | 38 ++++++++++++++++++++++++++++- t/README | 5 ++++ t/t5300-pack-object.sh | 5 ++++ 6 files changed, 130 insertions(+), 12 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index d9c89e87cd2..2784d58ec22 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -31,6 +31,8 @@ #include "packfile.h" #include "object-store.h" +#define IN_PACK(obj) oe_in_pack(&to_pack, obj) + static const char *pack_usage[] = { N_("git pack-objects --stdout [...] [< | < ]"), N_("git pack-objects [...] [< | < ]"), @@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, unsigned long limit, int usable_delta) { - struct packed_git *p = entry->in_pack; + struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; struct revindex_entry *revidx; off_t offset; @@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f, if (!reuse_object) to_reuse = 0; /* explicit */ - else if (!entry->in_pack) + else if (!IN_PACK(entry)) to_reuse = 0; /* can't reuse what we don't have */ else if (oe_type(entry) == OBJ_REF_DELTA || oe_type(entry) == OBJ_OFS_DELTA) @@ -1074,7 +1076,7 @@ static void create_object_entry(const struct object_id *oid, else nr_result++; if (found_pack) { - entry->in_pack = found_pack; + oe_set_in_pack(&to_pack, entry, found_pack); entry->in_pack_offset = found_offset; } @@ -1399,8 +1401,8 @@ static void cleanup_preferred_base(void) static void check_object(struct object_entry *entry) { - if (entry->in_pack) { - struct packed_git *p = entry->in_pack; + if (IN_PACK(entry)) { + struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; const unsigned char *base_ref = NULL; struct object_entry *base_entry; @@ -1535,14 +1537,16 @@ static int pack_offset_sort(const void *_a, const void *_b) { const struct object_entry *a = *(struct object_entry **)_a; const struct object_entry *b = *(struct object_entry **)_b; + const struct packed_git *a_in_pack = IN_PACK(a); + const struct packed_git *b_in_pack = IN_PACK(b); /* avoid filesystem trashing with loose objects */ - if (!a->in_pack && !b->in_pack) + if (!a_in_pack && !b_in_pack) return oidcmp(&a->idx.oid, &b->idx.oid); - if (a->in_pack < b->in_pack) + if (a_in_pack < b_in_pack) return -1; - if (a->in_pack > b->in_pack) + if (a_in_pack > b_in_pack) return 1; return a->in_pack_offset < b->in_pack_offset ? -1 : (a->in_pack_offset > b->in_pack_offset); @@ -1578,7 +1582,7 @@ static void drop_reused_delta(struct object_entry *entry) oi.sizep = &entry->size; oi.typep = &type; - if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) { + if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) { /* * We failed to get the info from this pack for some reason; * fall back to sha1_object_info, which may find another copy. @@ -1848,8 +1852,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, * it, we will still save the transfer cost, as we already know * the other side has it and we won't send src_entry at all. */ - if (reuse_delta && trg_entry->in_pack && - trg_entry->in_pack == src_entry->in_pack && + if (reuse_delta && IN_PACK(trg_entry) && + IN_PACK(trg_entry) == IN_PACK(src_entry) && !src_entry->preferred_base && trg_entry->in_pack_type != OBJ_REF_DELTA && trg_entry->in_pack_type != OBJ_OFS_DELTA) @@ -3192,6 +3196,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) } } + prepare_packing_data(&to_pack); + if (progress) progress_state = start_progress(_("Counting objects"), 0); if (!use_internal_rev_list) diff --git a/object-store.h b/object-store.h index fef33f345f0..71d40878674 100644 --- a/object-store.h +++ b/object-store.h @@ -69,6 +69,7 @@ struct packed_git { int index_version; time_t mtime; int pack_fd; + int index; /* for builtin/pack-objects.c */ unsigned pack_local:1, pack_keep:1, freshened:1, diff --git a/pack-objects.c b/pack-objects.c index 9558d13834e..08cfe68b6be 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -2,6 +2,8 @@ #include "object.h" #include "pack.h" #include "pack-objects.h" +#include "packfile.h" +#include "config.h" static uint32_t locate_object_entry_hash(struct packing_data *pdata, const unsigned char *sha1, @@ -86,6 +88,63 @@ struct object_entry *packlist_find(struct packing_data *pdata, return &pdata->objects[pdata->index[i] - 1]; } +static void prepare_in_pack_by_idx(struct packing_data *pdata) +{ + struct packed_git **mapping, *p; + int cnt = 0, nr = 1U << OE_IN_PACK_BITS; + + ALLOC_ARRAY(mapping, nr); + /* + * oe_in_pack() on an all-zero'd object_entry + * (i.e. in_pack_idx also zero) should return NULL. + */ + mapping[cnt++] = NULL; + for (p = get_packed_git(the_repository); p; p = p->next, cnt++) { + if (cnt == nr) { + free(mapping); + return; + } + p->index = cnt; + mapping[cnt] = p; + } + pdata->in_pack_by_idx = mapping; +} + +/* + * A new pack appears after prepare_in_pack_by_idx() has been + * run. This is likely a race. + * + * We could map this new pack to in_pack_by_idx[] array, but then we + * have to deal with full array anyway. And since it's hard to test + * this fall back code, just stay simple and fall back to using + * in_pack[] array. + */ +void oe_map_new_pack(struct packing_data *pack, + struct packed_git *p) +{ + uint32_t i; + + REALLOC_ARRAY(pack->in_pack, pack->nr_alloc); + + for (i = 0; i < pack->nr_objects; i++) + pack->in_pack[i] = oe_in_pack(pack, pack->objects + i); + + FREE_AND_NULL(pack->in_pack_by_idx); +} + +/* assume pdata is already zero'd by caller */ +void prepare_packing_data(struct packing_data *pdata) +{ + if (git_env_bool("GIT_TEST_FULL_IN_PACK_ARRAY", 0)) { + /* + * do not initialize in_pack_by_idx[] to force the + * slow path in oe_in_pack() + */ + } else { + prepare_in_pack_by_idx(pdata); + } +} + struct object_entry *packlist_alloc(struct packing_data *pdata, const unsigned char *sha1, uint32_t index_pos) @@ -95,6 +154,9 @@ struct object_entry *packlist_alloc(struct packing_data *pdata, if (pdata->nr_objects >= pdata->nr_alloc) { pdata->nr_alloc = (pdata->nr_alloc + 1024) * 3 / 2; REALLOC_ARRAY(pdata->objects, pdata->nr_alloc); + + if (!pdata->in_pack_by_idx) + REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc); } new_entry = pdata->objects + pdata->nr_objects++; @@ -107,5 +169,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata, else pdata->index[index_pos] = pdata->nr_objects; + if (pdata->in_pack) + pdata->in_pack[pdata->nr_objects - 1] = NULL; + return new_entry; } diff --git a/pack-objects.h b/pack-objects.h index 71ea992c3cd..7bedd5af811 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,8 +1,11 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +#include "object-store.h" + #define OE_DFS_STATE_BITS 2 #define OE_DEPTH_BITS 12 +#define OE_IN_PACK_BITS 10 /* * State flags for depth-first search used for analyzing delta cycles. @@ -65,7 +68,7 @@ enum dfs_state { struct object_entry { struct pack_idx_entry idx; unsigned long size; /* uncompressed size */ - struct packed_git *in_pack; /* already in pack */ + unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; struct object_entry *delta; /* delta base object */ struct object_entry *delta_child; /* deltified objects who bases me */ @@ -100,8 +103,18 @@ struct packing_data { uint32_t index_size; unsigned int *in_pack_pos; + + /* + * Only one of these can be non-NULL and they have different + * sizes. if in_pack_by_idx is allocated, oe_in_pack() returns + * the pack of an object using in_pack_idx field. If not, + * in_pack[] array is used the same way as in_pack_pos[] + */ + struct packed_git **in_pack_by_idx; + struct packed_git **in_pack; }; +void prepare_packing_data(struct packing_data *pdata); struct object_entry *packlist_alloc(struct packing_data *pdata, const unsigned char *sha1, uint32_t index_pos); @@ -158,4 +171,27 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack, pack->in_pack_pos[e - pack->objects] = pos; } +static inline struct packed_git *oe_in_pack(const struct packing_data *pack, + const struct object_entry *e) +{ + if (pack->in_pack_by_idx) + return pack->in_pack_by_idx[e->in_pack_idx]; + else + return pack->in_pack[e - pack->objects]; +} + +void oe_map_new_pack(struct packing_data *pack, + struct packed_git *p); +static inline void oe_set_in_pack(struct packing_data *pack, + struct object_entry *e, + struct packed_git *p) +{ + if (!p->index) + oe_map_new_pack(pack, p); + if (pack->in_pack_by_idx) + e->in_pack_idx = p->index; + else + pack->in_pack[e - pack->objects] = p; +} + #endif diff --git a/t/README b/t/README index d5e0a3c786d..a06a85bae8a 100644 --- a/t/README +++ b/t/README @@ -304,6 +304,11 @@ environment set. GIT_TEST_SPLIT_INDEX= forces split-index mode on the whole test suite. Accept any boolean values that are accepted by git-config. +GIT_TEST_FULL_IN_PACK_ARRAY= exercises the uncommon +pack-objects code path where there are more than 1024 packs even if +the actual number of packs in repository is below this limit. Accept +any boolean values that are accepted by git-config. + Naming Tests ------------ diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index 65ff60f2ee9..54eff03851d 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -457,6 +457,11 @@ test_expect_success !PTHREADS,C_LOCALE_OUTPUT 'pack-objects --threads=N or pack. grep -F "no threads support, ignoring pack.threads" err ' +test_expect_success 'pack-objects in too-many-packs mode' ' + GIT_TEST_FULL_IN_PACK_ARRAY=1 git repack -ad && + git fsck +' + # # WARNING! # -- cgit v1.2.3 From 898eba5e630696608ddca0ede489378e87464ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:06 +0200 Subject: pack-objects: refer to delta objects by index instead of pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These delta pointers always point to elements in the objects[] array in packing_data struct. We can only hold maximum 4G of those objects because the array size in nr_objects is uint32_t. We could use uint32_t indexes to address these elements instead of pointers. On 64-bit architecture (8 bytes per pointer) this would save 4 bytes per pointer. Convert these delta pointers to indexes. Since we need to handle NULL pointers as well, the index is shifted by one [1]. [1] This means we can only index 2^32-2 objects even though nr_objects could contain 2^32-1 objects. It should not be a problem in practice because when we grow objects[], nr_alloc would probably blow up long before nr_objects hits the wall. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 117 ++++++++++++++++++++++++++----------------------- pack-objects.h | 67 +++++++++++++++++++++++++--- 2 files changed, 125 insertions(+), 59 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 2784d58ec22..ec02641d2e7 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -32,6 +32,12 @@ #include "object-store.h" #define IN_PACK(obj) oe_in_pack(&to_pack, obj) +#define DELTA(obj) oe_delta(&to_pack, obj) +#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj) +#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj) +#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val) +#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val) +#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val) static const char *pack_usage[] = { N_("git pack-objects --stdout [...] [< | < ]"), @@ -129,10 +135,11 @@ static void *get_delta(struct object_entry *entry) buf = read_object_file(&entry->idx.oid, &type, &size); if (!buf) die("unable to read %s", oid_to_hex(&entry->idx.oid)); - base_buf = read_object_file(&entry->delta->idx.oid, &type, &base_size); + base_buf = read_object_file(&DELTA(entry)->idx.oid, &type, + &base_size); if (!base_buf) die("unable to read %s", - oid_to_hex(&entry->delta->idx.oid)); + oid_to_hex(&DELTA(entry)->idx.oid)); delta_buf = diff_delta(base_buf, base_size, buf, size, &delta_size, 0); if (!delta_buf || delta_size != entry->delta_size) @@ -288,12 +295,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent size = entry->delta_size; buf = entry->delta_data; entry->delta_data = NULL; - type = (allow_ofs_delta && entry->delta->idx.offset) ? + type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; } else { buf = get_delta(entry); size = entry->delta_size; - type = (allow_ofs_delta && entry->delta->idx.offset) ? + type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; } @@ -317,7 +324,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent * encoding of the relative offset for the delta * base from this object's position in the pack. */ - off_t ofs = entry->idx.offset - entry->delta->idx.offset; + off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset; unsigned pos = sizeof(dheader) - 1; dheader[pos] = ofs & 127; while (ofs >>= 7) @@ -343,7 +350,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent return 0; } hashwrite(f, header, hdrlen); - hashwrite(f, entry->delta->idx.oid.hash, 20); + hashwrite(f, DELTA(entry)->idx.oid.hash, 20); hdrlen += 20; } else { if (limit && hdrlen + datalen + 20 >= limit) { @@ -379,8 +386,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, dheader[MAX_PACK_OBJECT_HEADER]; unsigned hdrlen; - if (entry->delta) - type = (allow_ofs_delta && entry->delta->idx.offset) ? + if (DELTA(entry)) + type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; hdrlen = encode_in_pack_object_header(header, sizeof(header), type, entry->size); @@ -408,7 +415,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, } if (type == OBJ_OFS_DELTA) { - off_t ofs = entry->idx.offset - entry->delta->idx.offset; + off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset; unsigned pos = sizeof(dheader) - 1; dheader[pos] = ofs & 127; while (ofs >>= 7) @@ -427,7 +434,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, return 0; } hashwrite(f, header, hdrlen); - hashwrite(f, entry->delta->idx.oid.hash, 20); + hashwrite(f, DELTA(entry)->idx.oid.hash, 20); hdrlen += 20; reused_delta++; } else { @@ -467,13 +474,13 @@ static off_t write_object(struct hashfile *f, else limit = pack_size_limit - write_offset; - if (!entry->delta) + if (!DELTA(entry)) usable_delta = 0; /* no delta */ else if (!pack_size_limit) usable_delta = 1; /* unlimited packfile */ - else if (entry->delta->idx.offset == (off_t)-1) + else if (DELTA(entry)->idx.offset == (off_t)-1) usable_delta = 0; /* base was written to another pack */ - else if (entry->delta->idx.offset) + else if (DELTA(entry)->idx.offset) usable_delta = 1; /* base already exists in this pack */ else usable_delta = 0; /* base could end up in another pack */ @@ -489,7 +496,7 @@ static off_t write_object(struct hashfile *f, /* ... but pack split may override that */ else if (oe_type(entry) != entry->in_pack_type) to_reuse = 0; /* pack has delta which is unusable */ - else if (entry->delta) + else if (DELTA(entry)) to_reuse = 0; /* we want to pack afresh */ else to_reuse = 1; /* we have it in-pack undeltified, @@ -541,12 +548,12 @@ static enum write_one_status write_one(struct hashfile *f, } /* if we are deltified, write out base object first. */ - if (e->delta) { + if (DELTA(e)) { e->idx.offset = 1; /* now recurse */ - switch (write_one(f, e->delta, offset)) { + switch (write_one(f, DELTA(e), offset)) { case WRITE_ONE_RECURSIVE: /* we cannot depend on this one */ - e->delta = NULL; + SET_DELTA(e, NULL); break; default: break; @@ -608,34 +615,34 @@ static void add_descendants_to_write_order(struct object_entry **wo, /* add this node... */ add_to_write_order(wo, endp, e); /* all its siblings... */ - for (s = e->delta_sibling; s; s = s->delta_sibling) { + for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) { add_to_write_order(wo, endp, s); } } /* drop down a level to add left subtree nodes if possible */ - if (e->delta_child) { + if (DELTA_CHILD(e)) { add_to_order = 1; - e = e->delta_child; + e = DELTA_CHILD(e); } else { add_to_order = 0; /* our sibling might have some children, it is next */ - if (e->delta_sibling) { - e = e->delta_sibling; + if (DELTA_SIBLING(e)) { + e = DELTA_SIBLING(e); continue; } /* go back to our parent node */ - e = e->delta; - while (e && !e->delta_sibling) { + e = DELTA(e); + while (e && !DELTA_SIBLING(e)) { /* we're on the right side of a subtree, keep * going up until we can go right again */ - e = e->delta; + e = DELTA(e); } if (!e) { /* done- we hit our original root node */ return; } /* pass it off to sibling at this level */ - e = e->delta_sibling; + e = DELTA_SIBLING(e); } }; } @@ -646,7 +653,7 @@ static void add_family_to_write_order(struct object_entry **wo, { struct object_entry *root; - for (root = e; root->delta; root = root->delta) + for (root = e; DELTA(root); root = DELTA(root)) ; /* nothing */ add_descendants_to_write_order(wo, endp, root); } @@ -661,8 +668,8 @@ static struct object_entry **compute_write_order(void) for (i = 0; i < to_pack.nr_objects; i++) { objects[i].tagged = 0; objects[i].filled = 0; - objects[i].delta_child = NULL; - objects[i].delta_sibling = NULL; + SET_DELTA_CHILD(&objects[i], NULL); + SET_DELTA_SIBLING(&objects[i], NULL); } /* @@ -672,11 +679,11 @@ static struct object_entry **compute_write_order(void) */ for (i = to_pack.nr_objects; i > 0;) { struct object_entry *e = &objects[--i]; - if (!e->delta) + if (!DELTA(e)) continue; /* Mark me as the first child */ - e->delta_sibling = e->delta->delta_child; - e->delta->delta_child = e; + e->delta_sibling_idx = DELTA(e)->delta_child_idx; + SET_DELTA_CHILD(DELTA(e), e); } /* @@ -1493,10 +1500,10 @@ static void check_object(struct object_entry *entry) * circular deltas. */ oe_set_type(entry, entry->in_pack_type); - entry->delta = base_entry; + SET_DELTA(entry, base_entry); entry->delta_size = entry->size; - entry->delta_sibling = base_entry->delta_child; - base_entry->delta_child = entry; + entry->delta_sibling_idx = base_entry->delta_child_idx; + SET_DELTA_CHILD(base_entry, entry); unuse_pack(&w_curs); return; } @@ -1567,17 +1574,19 @@ static int pack_offset_sort(const void *_a, const void *_b) */ static void drop_reused_delta(struct object_entry *entry) { - struct object_entry **p = &entry->delta->delta_child; + unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx; struct object_info oi = OBJECT_INFO_INIT; enum object_type type; - while (*p) { - if (*p == entry) - *p = (*p)->delta_sibling; + while (*idx) { + struct object_entry *oe = &to_pack.objects[*idx - 1]; + + if (oe == entry) + *idx = oe->delta_sibling_idx; else - p = &(*p)->delta_sibling; + idx = &oe->delta_sibling_idx; } - entry->delta = NULL; + SET_DELTA(entry, NULL); entry->depth = 0; oi.sizep = &entry->size; @@ -1617,7 +1626,7 @@ static void break_delta_chains(struct object_entry *entry) for (cur = entry, total_depth = 0; cur; - cur = cur->delta, total_depth++) { + cur = DELTA(cur), total_depth++) { if (cur->dfs_state == DFS_DONE) { /* * We've already seen this object and know it isn't @@ -1642,7 +1651,7 @@ static void break_delta_chains(struct object_entry *entry) * it's not a delta, we're done traversing, but we'll mark it * done to save time on future traversals. */ - if (!cur->delta) { + if (!DELTA(cur)) { cur->dfs_state = DFS_DONE; break; } @@ -1665,7 +1674,7 @@ static void break_delta_chains(struct object_entry *entry) * We keep all commits in the chain that we examined. */ cur->dfs_state = DFS_ACTIVE; - if (cur->delta->dfs_state == DFS_ACTIVE) { + if (DELTA(cur)->dfs_state == DFS_ACTIVE) { drop_reused_delta(cur); cur->dfs_state = DFS_DONE; break; @@ -1680,7 +1689,7 @@ static void break_delta_chains(struct object_entry *entry) * an extra "next" pointer to keep going after we reset cur->delta. */ for (cur = entry; cur; cur = next) { - next = cur->delta; + next = DELTA(cur); /* * We should have a chain of zero or more ACTIVE states down to @@ -1865,7 +1874,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, /* Now some size filtering heuristics. */ trg_size = trg_entry->size; - if (!trg_entry->delta) { + if (!DELTA(trg_entry)) { max_size = trg_size/2 - 20; ref_depth = 1; } else { @@ -1939,7 +1948,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, if (!delta_buf) return 0; - if (trg_entry->delta) { + if (DELTA(trg_entry)) { /* Prefer only shallower same-sized deltas. */ if (delta_size == trg_entry->delta_size && src->depth + 1 >= trg->depth) { @@ -1968,7 +1977,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, free(delta_buf); } - trg_entry->delta = src_entry; + SET_DELTA(trg_entry, src_entry); trg_entry->delta_size = delta_size; trg->depth = src->depth + 1; @@ -1977,13 +1986,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) { - struct object_entry *child = me->delta_child; + struct object_entry *child = DELTA_CHILD(me); unsigned int m = n; while (child) { unsigned int c = check_delta_limit(child, n + 1); if (m < c) m = c; - child = child->delta_sibling; + child = DELTA_SIBLING(child); } return m; } @@ -2052,7 +2061,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size, * otherwise they would become too deep. */ max_depth = depth; - if (entry->delta_child) { + if (DELTA_CHILD(entry)) { max_depth -= check_delta_limit(entry, 0); if (max_depth <= 0) goto next; @@ -2102,7 +2111,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size, * depth, leaving it in the window is pointless. we * should evict it first. */ - if (entry->delta && max_depth <= n->depth) + if (DELTA(entry) && max_depth <= n->depth) continue; /* @@ -2110,7 +2119,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size, * currently deltified object, to keep it longer. It will * be the first base object to be attempted next. */ - if (entry->delta) { + if (DELTA(entry)) { struct unpacked swap = array[best_base]; int dist = (window + idx - best_base) % window; int dst = best_base; @@ -2431,7 +2440,7 @@ static void prepare_pack(int window, int depth) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = to_pack.objects + i; - if (entry->delta) + if (DELTA(entry)) /* This happens if we decided to reuse existing * delta from a pack. "reuse_delta &&" is implied. */ diff --git a/pack-objects.h b/pack-objects.h index 7bedd5af811..e962dce3c05 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -70,11 +70,11 @@ struct object_entry { unsigned long size; /* uncompressed size */ unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; - struct object_entry *delta; /* delta base object */ - struct object_entry *delta_child; /* deltified objects who bases me */ - struct object_entry *delta_sibling; /* other deltified objects who - * uses the same base as me - */ + uint32_t delta_idx; /* delta base object */ + uint32_t delta_child_idx; /* deltified objects who bases me */ + uint32_t delta_sibling_idx; /* other deltified objects who + * uses the same base as me + */ void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ unsigned long z_delta_size; /* delta data size (compressed) */ @@ -194,4 +194,61 @@ static inline void oe_set_in_pack(struct packing_data *pack, pack->in_pack[e - pack->objects] = p; } +static inline struct object_entry *oe_delta( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_idx) + return &pack->objects[e->delta_idx - 1]; + return NULL; +} + +static inline void oe_set_delta(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_idx = (delta - pack->objects) + 1; + else + e->delta_idx = 0; +} + +static inline struct object_entry *oe_delta_child( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_child_idx) + return &pack->objects[e->delta_child_idx - 1]; + return NULL; +} + +static inline void oe_set_delta_child(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_child_idx = (delta - pack->objects) + 1; + else + e->delta_child_idx = 0; +} + +static inline struct object_entry *oe_delta_sibling( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_sibling_idx) + return &pack->objects[e->delta_sibling_idx - 1]; + return NULL; +} + +static inline void oe_set_delta_sibling(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_sibling_idx = (delta - pack->objects) + 1; + else + e->delta_sibling_idx = 0; +} + #endif -- cgit v1.2.3 From 0cb3c1427a1e9cee638e0c1629933c907493d7e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:07 +0200 Subject: pack-objects: shrink z_delta_size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only cache deltas when it's smaller than a certain limit. This limit defaults to 1000 but save its compressed length in a 64-bit field. Shrink that field down to 20 bits, so you can only cache 1MB deltas. Larger deltas must be recomputed at when the pack is written down. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/config.txt | 3 ++- builtin/pack-objects.c | 24 ++++++++++++++++++------ pack-objects.h | 3 ++- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index d97f10722c1..a62134264e3 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -2459,7 +2459,8 @@ pack.deltaCacheLimit:: The maximum size of a delta, that is cached in linkgit:git-pack-objects[1]. This cache is used to speed up the writing object phase by not having to recompute the final delta - result once the best match for all objects is found. Defaults to 1000. + result once the best match for all objects is found. + Defaults to 1000. Maximum value is 65535. pack.threads:: Specifies the number of threads to spawn when searching for best diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index ec02641d2e7..211bb1ad0ef 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2099,12 +2099,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size, * between writes at that moment. */ if (entry->delta_data && !pack_to_stdout) { - entry->z_delta_size = do_compress(&entry->delta_data, - entry->delta_size); - cache_lock(); - delta_cache_size -= entry->delta_size; - delta_cache_size += entry->z_delta_size; - cache_unlock(); + unsigned long size; + + size = do_compress(&entry->delta_data, entry->delta_size); + if (size < (1U << OE_Z_DELTA_BITS)) { + entry->z_delta_size = size; + cache_lock(); + delta_cache_size -= entry->delta_size; + delta_cache_size += entry->z_delta_size; + cache_unlock(); + } else { + FREE_AND_NULL(entry->delta_data); + entry->z_delta_size = 0; + } } /* if we made n a delta, and if n is already at max @@ -3087,6 +3094,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) depth, (1 << OE_DEPTH_BITS) - 1); depth = (1 << OE_DEPTH_BITS) - 1; } + if (cache_max_small_delta_size >= (1U << OE_Z_DELTA_BITS)) { + warning(_("pack.deltaCacheLimit is too high, forcing %d"), + (1U << OE_Z_DELTA_BITS) - 1); + cache_max_small_delta_size = (1U << OE_Z_DELTA_BITS) - 1; + } argv_array_push(&rp, "pack-objects"); if (thin) { diff --git a/pack-objects.h b/pack-objects.h index e962dce3c05..9d0391c1739 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -6,6 +6,7 @@ #define OE_DFS_STATE_BITS 2 #define OE_DEPTH_BITS 12 #define OE_IN_PACK_BITS 10 +#define OE_Z_DELTA_BITS 20 /* * State flags for depth-first search used for analyzing delta cycles. @@ -77,7 +78,7 @@ struct object_entry { */ void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ - unsigned long z_delta_size; /* delta data size (compressed) */ + unsigned z_delta_size:OE_Z_DELTA_BITS; unsigned type_:TYPE_BITS; unsigned in_pack_type:TYPE_BITS; /* could be delta */ unsigned type_valid:1; -- cgit v1.2.3 From 660b373542589157aff78dd37ce582237027ba94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:08 +0200 Subject: pack-objects: don't check size when the object is bad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sha1_object_info() in check_objects() may fail to locate an object in the pack and return type OBJ_BAD. In that case, it will likely leave the "size" field untouched. We delay error handling until later in prepare_pack() though. Until then, do not touch "size" field. This field should contain the default value zero, but we can't say sha1_object_info() cannot damage it. This becomes more important later when the object size may have to be retrieved back from the (non-existing) pack. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 211bb1ad0ef..e75693176ea 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1742,7 +1742,7 @@ static void get_object_details(void) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = sorted_by_offset[i]; check_object(entry); - if (big_file_threshold < entry->size) + if (entry->type_valid && big_file_threshold < entry->size) entry->no_try_delta = 1; } @@ -2453,7 +2453,7 @@ static void prepare_pack(int window, int depth) */ continue; - if (entry->size < 50) + if (!entry->type_valid || entry->size < 50) continue; if (entry->no_try_delta) -- cgit v1.2.3 From 27a7d0679f17bd536f566e76e51058de0e1fa17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:09 +0200 Subject: pack-objects: clarify the use of object_entry::size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While this field most of the time contains the canonical object size, there is one case it does not: when we have found that the base object of the delta in question is also to be packed, we will very happily reuse the delta by copying it over instead of regenerating the new delta. "size" in this case will record the delta size, not canonical object size. Later on in write_reuse_object(), we reconstruct the delta header and "size" is used for this purpose. When this happens, the "type" field contains a delta type instead of a canonical type. Highlight this in the code since it could be tricky to see. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 11 ++++++++--- pack-objects.h | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index e75693176ea..779f14a45ed 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1418,6 +1418,7 @@ static void check_object(struct object_entry *entry) off_t ofs; unsigned char *buf, c; enum object_type type; + unsigned long in_pack_size; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); @@ -1427,7 +1428,7 @@ static void check_object(struct object_entry *entry) */ used = unpack_object_header_buffer(buf, avail, &type, - &entry->size); + &in_pack_size); if (used == 0) goto give_up; @@ -1444,6 +1445,7 @@ static void check_object(struct object_entry *entry) default: /* Not a delta hence we've already got all we need. */ oe_set_type(entry, entry->in_pack_type); + entry->size = in_pack_size; entry->in_pack_header_size = used; if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB) goto give_up; @@ -1500,6 +1502,7 @@ static void check_object(struct object_entry *entry) * circular deltas. */ oe_set_type(entry, entry->in_pack_type); + entry->size = in_pack_size; /* delta size */ SET_DELTA(entry, base_entry); entry->delta_size = entry->size; entry->delta_sibling_idx = base_entry->delta_child_idx; @@ -1509,13 +1512,15 @@ static void check_object(struct object_entry *entry) } if (oe_type(entry)) { + off_t delta_pos; + /* * This must be a delta and we already know what the * final object type is. Let's extract the actual * object size from the delta header. */ - entry->size = get_size_from_delta(p, &w_curs, - entry->in_pack_offset + entry->in_pack_header_size); + delta_pos = entry->in_pack_offset + entry->in_pack_header_size; + entry->size = get_size_from_delta(p, &w_curs, delta_pos); if (entry->size == 0) goto give_up; unuse_pack(&w_curs); diff --git a/pack-objects.h b/pack-objects.h index 9d0391c1739..e4ea6a350c5 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -32,7 +32,9 @@ enum dfs_state { * * "size" is the uncompressed object size. Compressed size of the raw * data for an object in a pack is not stored anywhere but is computed - * and made available when reverse .idx is made. + * and made available when reverse .idx is made. Note that when a + * delta is reused, "size" is the uncompressed _delta_ size, not the + * canonical one after the delta has been applied. * * "hash" contains a path name hash which is used for sorting the * delta list and also during delta searching. Once prepare_pack() -- cgit v1.2.3 From ac77d0c370dc5b23ac1ec4a13c754fe7ffa48564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:10 +0200 Subject: pack-objects: shrink size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's very very rare that an uncompressed object is larger than 4GB (partly because Git does not handle those large files very well to begin with). Let's optimize it for the common case where object size is smaller than this limit. Shrink size field down to 31 bits and one overflow bit. If the size is too large, we read it back from disk. As noted in the previous patch, we need to return the delta size instead of canonical size when the to-be-reused object entry type is a delta instead of a canonical one. Add two compare helpers that can take advantage of the overflow bit (e.g. if the file is 4GB+, chances are it's already larger than core.bigFileThreshold and there's no point in comparing the actual value). Another note about oe_get_size_slow(). This function MUST be thread safe because SIZE() macro is used inside try_delta() which may run in parallel. Outside parallel code, no-contention locking should be dirt cheap (or insignificant compared to i/o access anyway). To exercise this code, it's best to run the test suite with something like make test GIT_TEST_OE_SIZE=4 which forces this code on all objects larger than 3 bytes. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 105 +++++++++++++++++++++++++++++++++++++------------ pack-objects.c | 3 ++ pack-objects.h | 57 ++++++++++++++++++++++++++- t/README | 6 +++ 4 files changed, 145 insertions(+), 26 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 779f14a45ed..cccd0f8040d 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -32,6 +32,8 @@ #include "object-store.h" #define IN_PACK(obj) oe_in_pack(&to_pack, obj) +#define SIZE(obj) oe_size(&to_pack, obj) +#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size) #define DELTA(obj) oe_delta(&to_pack, obj) #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj) #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj) @@ -276,7 +278,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent if (!usable_delta) { if (oe_type(entry) == OBJ_BLOB && - entry->size > big_file_threshold && + oe_size_greater_than(&to_pack, entry, big_file_threshold) && (st = open_istream(&entry->idx.oid, &type, &size, NULL)) != NULL) buf = NULL; else { @@ -385,12 +387,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, unsigned char header[MAX_PACK_OBJECT_HEADER], dheader[MAX_PACK_OBJECT_HEADER]; unsigned hdrlen; + unsigned long entry_size = SIZE(entry); if (DELTA(entry)) type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; hdrlen = encode_in_pack_object_header(header, sizeof(header), - type, entry->size); + type, entry_size); offset = entry->in_pack_offset; revidx = find_pack_revindex(p, offset); @@ -407,7 +410,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, datalen -= entry->in_pack_header_size; if (!pack_to_stdout && p->index_version == 1 && - check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) { + check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) { error("corrupt packed object for %s", oid_to_hex(&entry->idx.oid)); unuse_pack(&w_curs); @@ -1408,6 +1411,8 @@ static void cleanup_preferred_base(void) static void check_object(struct object_entry *entry) { + unsigned long canonical_size; + if (IN_PACK(entry)) { struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; @@ -1445,7 +1450,7 @@ static void check_object(struct object_entry *entry) default: /* Not a delta hence we've already got all we need. */ oe_set_type(entry, entry->in_pack_type); - entry->size = in_pack_size; + SET_SIZE(entry, in_pack_size); entry->in_pack_header_size = used; if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB) goto give_up; @@ -1502,9 +1507,9 @@ static void check_object(struct object_entry *entry) * circular deltas. */ oe_set_type(entry, entry->in_pack_type); - entry->size = in_pack_size; /* delta size */ + SET_SIZE(entry, in_pack_size); /* delta size */ SET_DELTA(entry, base_entry); - entry->delta_size = entry->size; + entry->delta_size = in_pack_size; entry->delta_sibling_idx = base_entry->delta_child_idx; SET_DELTA_CHILD(base_entry, entry); unuse_pack(&w_curs); @@ -1520,9 +1525,10 @@ static void check_object(struct object_entry *entry) * object size from the delta header. */ delta_pos = entry->in_pack_offset + entry->in_pack_header_size; - entry->size = get_size_from_delta(p, &w_curs, delta_pos); - if (entry->size == 0) + canonical_size = get_size_from_delta(p, &w_curs, delta_pos); + if (canonical_size == 0) goto give_up; + SET_SIZE(entry, canonical_size); unuse_pack(&w_curs); return; } @@ -1536,13 +1542,17 @@ static void check_object(struct object_entry *entry) unuse_pack(&w_curs); } - oe_set_type(entry, oid_object_info(&entry->idx.oid, &entry->size)); - /* - * The error condition is checked in prepare_pack(). This is - * to permit a missing preferred base object to be ignored - * as a preferred base. Doing so can result in a larger - * pack file, but the transfer will still take place. - */ + oe_set_type(entry, oid_object_info(&entry->idx.oid, &canonical_size)); + if (entry->type_valid) { + SET_SIZE(entry, canonical_size); + } else { + /* + * Bad object type is checked in prepare_pack(). This is + * to permit a missing preferred base object to be ignored + * as a preferred base. Doing so can result in a larger + * pack file, but the transfer will still take place. + */ + } } static int pack_offset_sort(const void *_a, const void *_b) @@ -1582,6 +1592,7 @@ static void drop_reused_delta(struct object_entry *entry) unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx; struct object_info oi = OBJECT_INFO_INIT; enum object_type type; + unsigned long size; while (*idx) { struct object_entry *oe = &to_pack.objects[*idx - 1]; @@ -1594,7 +1605,7 @@ static void drop_reused_delta(struct object_entry *entry) SET_DELTA(entry, NULL); entry->depth = 0; - oi.sizep = &entry->size; + oi.sizep = &size; oi.typep = &type; if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) { /* @@ -1603,11 +1614,11 @@ static void drop_reused_delta(struct object_entry *entry) * And if that fails, the error will be recorded in oe_type(entry) * and dealt with in prepare_pack(). */ - oe_set_type(entry, oid_object_info(&entry->idx.oid, - &entry->size)); + oe_set_type(entry, oid_object_info(&entry->idx.oid, &size)); } else { oe_set_type(entry, type); } + SET_SIZE(entry, size); } /* @@ -1747,7 +1758,8 @@ static void get_object_details(void) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = sorted_by_offset[i]; check_object(entry); - if (entry->type_valid && big_file_threshold < entry->size) + if (entry->type_valid && + oe_size_greater_than(&to_pack, entry, big_file_threshold)) entry->no_try_delta = 1; } @@ -1776,6 +1788,8 @@ static int type_size_sort(const void *_a, const void *_b) const struct object_entry *b = *(struct object_entry **)_b; enum object_type a_type = oe_type(a); enum object_type b_type = oe_type(b); + unsigned long a_size = SIZE(a); + unsigned long b_size = SIZE(b); if (a_type > b_type) return -1; @@ -1789,9 +1803,9 @@ static int type_size_sort(const void *_a, const void *_b) return -1; if (a->preferred_base < b->preferred_base) return 1; - if (a->size > b->size) + if (a_size > b_size) return -1; - if (a->size < b->size) + if (a_size < b_size) return 1; return a < b ? -1 : (a > b); /* newest first */ } @@ -1844,6 +1858,46 @@ static pthread_mutex_t progress_mutex; #endif +/* + * Return the size of the object without doing any delta + * reconstruction (so non-deltas are true object sizes, but deltas + * return the size of the delta data). + */ +unsigned long oe_get_size_slow(struct packing_data *pack, + const struct object_entry *e) +{ + struct packed_git *p; + struct pack_window *w_curs; + unsigned char *buf; + enum object_type type; + unsigned long used, avail, size; + + if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) { + read_lock(); + if (oid_object_info(&e->idx.oid, &size) < 0) + die(_("unable to get size of %s"), + oid_to_hex(&e->idx.oid)); + read_unlock(); + return size; + } + + p = oe_in_pack(pack, e); + if (!p) + BUG("when e->type is a delta, it must belong to a pack"); + + read_lock(); + w_curs = NULL; + buf = use_pack(p, &w_curs, e->in_pack_offset, &avail); + used = unpack_object_header_buffer(buf, avail, &type, &size); + if (used == 0) + die(_("unable to parse object header of %s"), + oid_to_hex(&e->idx.oid)); + + unuse_pack(&w_curs); + read_unlock(); + return size; +} + static int try_delta(struct unpacked *trg, struct unpacked *src, unsigned max_depth, unsigned long *mem_usage) { @@ -1878,7 +1932,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, return 0; /* Now some size filtering heuristics. */ - trg_size = trg_entry->size; + trg_size = SIZE(trg_entry); if (!DELTA(trg_entry)) { max_size = trg_size/2 - 20; ref_depth = 1; @@ -1890,7 +1944,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, (max_depth - ref_depth + 1); if (max_size == 0) return 0; - src_size = src_entry->size; + src_size = SIZE(src_entry); sizediff = src_size < trg_size ? trg_size - src_size : 0; if (sizediff >= max_size) return 0; @@ -2008,7 +2062,7 @@ static unsigned long free_unpacked(struct unpacked *n) free_delta_index(n->index); n->index = NULL; if (n->data) { - freed_mem += n->entry->size; + freed_mem += SIZE(n->entry); FREE_AND_NULL(n->data); } n->entry = NULL; @@ -2458,7 +2512,8 @@ static void prepare_pack(int window, int depth) */ continue; - if (!entry->type_valid || entry->size < 50) + if (!entry->type_valid || + oe_size_less_than(&to_pack, entry, 50)) continue; if (entry->no_try_delta) diff --git a/pack-objects.c b/pack-objects.c index 08cfe68b6be..e0c46005680 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -143,6 +143,9 @@ void prepare_packing_data(struct packing_data *pdata) } else { prepare_in_pack_by_idx(pdata); } + + pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE", + 1U << OE_SIZE_BITS); } struct object_entry *packlist_alloc(struct packing_data *pdata, diff --git a/pack-objects.h b/pack-objects.h index e4ea6a350c5..ee2c7ab382b 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -7,6 +7,11 @@ #define OE_DEPTH_BITS 12 #define OE_IN_PACK_BITS 10 #define OE_Z_DELTA_BITS 20 +/* + * Note that oe_set_size() becomes expensive when the given size is + * above this limit. Don't lower it too much. + */ +#define OE_SIZE_BITS 31 /* * State flags for depth-first search used for analyzing delta cycles. @@ -70,7 +75,8 @@ enum dfs_state { */ struct object_entry { struct pack_idx_entry idx; - unsigned long size; /* uncompressed size */ + unsigned size_:OE_SIZE_BITS; + unsigned size_valid:1; unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; uint32_t delta_idx; /* delta base object */ @@ -115,6 +121,8 @@ struct packing_data { */ struct packed_git **in_pack_by_idx; struct packed_git **in_pack; + + uintmax_t oe_size_limit; }; void prepare_packing_data(struct packing_data *pdata); @@ -254,4 +262,51 @@ static inline void oe_set_delta_sibling(struct packing_data *pack, e->delta_sibling_idx = 0; } +unsigned long oe_get_size_slow(struct packing_data *pack, + const struct object_entry *e); +static inline unsigned long oe_size(struct packing_data *pack, + const struct object_entry *e) +{ + if (e->size_valid) + return e->size_; + + return oe_get_size_slow(pack, e); +} + +static inline int oe_size_less_than(struct packing_data *pack, + const struct object_entry *lhs, + unsigned long rhs) +{ + if (lhs->size_valid) + return lhs->size_ < rhs; + if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */ + return 0; + return oe_get_size_slow(pack, lhs) < rhs; +} + +static inline int oe_size_greater_than(struct packing_data *pack, + const struct object_entry *lhs, + unsigned long rhs) +{ + if (lhs->size_valid) + return lhs->size_ > rhs; + if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */ + return 1; + return oe_get_size_slow(pack, lhs) > rhs; +} + +static inline void oe_set_size(struct packing_data *pack, + struct object_entry *e, + unsigned long size) +{ + if (size < pack->oe_size_limit) { + e->size_ = size; + e->size_valid = 1; + } else { + e->size_valid = 0; + if (oe_get_size_slow(pack, e) != size) + BUG("'size' is supposed to be the object size!"); + } +} + #endif diff --git a/t/README b/t/README index a06a85bae8a..8373a27fea3 100644 --- a/t/README +++ b/t/README @@ -309,6 +309,12 @@ pack-objects code path where there are more than 1024 packs even if the actual number of packs in repository is below this limit. Accept any boolean values that are accepted by git-config. +GIT_TEST_OE_SIZE= exercises the uncommon pack-objects code path +where we do not cache object size in memory and read it from existing +packs on demand. This normally only happens when the object size is +over 2GB. This variable forces the code path on any object larger than + bytes. + Naming Tests ------------ -- cgit v1.2.3 From 0aca34e8269514ebb67676e0470a314615494ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:11 +0200 Subject: pack-objects: shrink delta_size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allowing a delta size of 64 bits is crazy. Shrink this field down to 20 bits with one overflow bit. If we find an existing delta larger than 1MB, we do not cache delta_size at all and will get the value from oe_size(), potentially from disk if it's larger than 4GB. Note, since DELTA_SIZE() is used in try_delta() code, it must be thread-safe. Luckily oe_size() does guarantee this so we it is thread-safe. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 26 ++++++++++++++++---------- pack-objects.h | 23 ++++++++++++++++++++++- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index cccd0f8040d..88d2bb8153e 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -34,10 +34,12 @@ #define IN_PACK(obj) oe_in_pack(&to_pack, obj) #define SIZE(obj) oe_size(&to_pack, obj) #define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size) +#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj) #define DELTA(obj) oe_delta(&to_pack, obj) #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj) #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj) #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val) +#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val) #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val) #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val) @@ -144,7 +146,7 @@ static void *get_delta(struct object_entry *entry) oid_to_hex(&DELTA(entry)->idx.oid)); delta_buf = diff_delta(base_buf, base_size, buf, size, &delta_size, 0); - if (!delta_buf || delta_size != entry->delta_size) + if (!delta_buf || delta_size != DELTA_SIZE(entry)) die("delta size changed"); free(buf); free(base_buf); @@ -294,14 +296,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent FREE_AND_NULL(entry->delta_data); entry->z_delta_size = 0; } else if (entry->delta_data) { - size = entry->delta_size; + size = DELTA_SIZE(entry); buf = entry->delta_data; entry->delta_data = NULL; type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; } else { buf = get_delta(entry); - size = entry->delta_size; + size = DELTA_SIZE(entry); type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? OBJ_OFS_DELTA : OBJ_REF_DELTA; } @@ -1509,7 +1511,7 @@ static void check_object(struct object_entry *entry) oe_set_type(entry, entry->in_pack_type); SET_SIZE(entry, in_pack_size); /* delta size */ SET_DELTA(entry, base_entry); - entry->delta_size = in_pack_size; + SET_DELTA_SIZE(entry, in_pack_size); entry->delta_sibling_idx = base_entry->delta_child_idx; SET_DELTA_CHILD(base_entry, entry); unuse_pack(&w_curs); @@ -1937,7 +1939,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, max_size = trg_size/2 - 20; ref_depth = 1; } else { - max_size = trg_entry->delta_size; + max_size = DELTA_SIZE(trg_entry); ref_depth = trg->depth; } max_size = (uint64_t)max_size * (max_depth - src->depth) / @@ -2006,10 +2008,14 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size); if (!delta_buf) return 0; + if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) { + free(delta_buf); + return 0; + } if (DELTA(trg_entry)) { /* Prefer only shallower same-sized deltas. */ - if (delta_size == trg_entry->delta_size && + if (delta_size == DELTA_SIZE(trg_entry) && src->depth + 1 >= trg->depth) { free(delta_buf); return 0; @@ -2024,7 +2030,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, free(trg_entry->delta_data); cache_lock(); if (trg_entry->delta_data) { - delta_cache_size -= trg_entry->delta_size; + delta_cache_size -= DELTA_SIZE(trg_entry); trg_entry->delta_data = NULL; } if (delta_cacheable(src_size, trg_size, delta_size)) { @@ -2037,7 +2043,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, } SET_DELTA(trg_entry, src_entry); - trg_entry->delta_size = delta_size; + SET_DELTA_SIZE(trg_entry, delta_size); trg->depth = src->depth + 1; return 1; @@ -2160,11 +2166,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size, if (entry->delta_data && !pack_to_stdout) { unsigned long size; - size = do_compress(&entry->delta_data, entry->delta_size); + size = do_compress(&entry->delta_data, DELTA_SIZE(entry)); if (size < (1U << OE_Z_DELTA_BITS)) { entry->z_delta_size = size; cache_lock(); - delta_cache_size -= entry->delta_size; + delta_cache_size -= DELTA_SIZE(entry); delta_cache_size += entry->z_delta_size; cache_unlock(); } else { diff --git a/pack-objects.h b/pack-objects.h index ee2c7ab382b..1c588184b28 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -12,6 +12,7 @@ * above this limit. Don't lower it too much. */ #define OE_SIZE_BITS 31 +#define OE_DELTA_SIZE_BITS 20 /* * State flags for depth-first search used for analyzing delta cycles. @@ -85,7 +86,8 @@ struct object_entry { * uses the same base as me */ void *delta_data; /* cached delta (uncompressed) */ - unsigned long delta_size; /* delta data size (uncompressed) */ + unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */ + unsigned delta_size_valid:1; unsigned z_delta_size:OE_Z_DELTA_BITS; unsigned type_:TYPE_BITS; unsigned in_pack_type:TYPE_BITS; /* could be delta */ @@ -309,4 +311,23 @@ static inline void oe_set_size(struct packing_data *pack, } } +static inline unsigned long oe_delta_size(struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_size_valid) + return e->delta_size_; + return oe_size(pack, e); +} + +static inline void oe_set_delta_size(struct packing_data *pack, + struct object_entry *e, + unsigned long size) +{ + e->delta_size_ = size; + e->delta_size_valid = e->delta_size_ == size; + if (!e->delta_size_valid && size != oe_size(pack, e)) + BUG("this can only happen in check_object() " + "where delta size is the same as entry size"); +} + #endif -- cgit v1.2.3 From 3b13a5f263872c4643f7f0eb6eccb7a8196b2760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:12 +0200 Subject: pack-objects: reorder members to shrink struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches leave lots of holes and padding in this struct. This patch reorders the members and shrinks the struct down to 80 bytes (from 136 bytes on 64-bit systems, before any field shrinking is done) with 16 bits to spare (and a couple more in in_pack_header_size when we really run out of bits). This is the last in a series of memory reduction patches (see "pack-objects: a bit of document about struct object_entry" for the first one). Overall they've reduced repack memory size on linux-2.6.git from 3.747G to 3.424G, or by around 320M, a decrease of 8.5%. The runtime of repack has stayed the same throughout this series. Ævar's testing on a big monorepo he has access to (bigger than linux-2.6.git) has shown a 7.9% reduction, so the overall expected improvement should be somewhere around 8%. See 87po42cwql.fsf@evledraar.gmail.com on-list (https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/) for more detailed numbers and a test script used to produce the numbers cited above. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/pack-objects.h b/pack-objects.h index 1c588184b28..e5456c6c891 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -28,6 +28,10 @@ enum dfs_state { }; /* + * The size of struct nearly determines pack-objects's memory + * consumption. This struct is packed tight for that reason. When you + * add or reorder something in this struct, think a bit about this. + * * basic object info * ----------------- * idx.oid is filled up before delta searching starts. idx.crc32 is @@ -76,34 +80,44 @@ enum dfs_state { */ struct object_entry { struct pack_idx_entry idx; + void *delta_data; /* cached delta (uncompressed) */ + off_t in_pack_offset; + uint32_t hash; /* name hint hash */ unsigned size_:OE_SIZE_BITS; unsigned size_valid:1; - unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ - off_t in_pack_offset; uint32_t delta_idx; /* delta base object */ uint32_t delta_child_idx; /* deltified objects who bases me */ uint32_t delta_sibling_idx; /* other deltified objects who * uses the same base as me */ - void *delta_data; /* cached delta (uncompressed) */ unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */ unsigned delta_size_valid:1; + unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ unsigned z_delta_size:OE_Z_DELTA_BITS; + unsigned type_valid:1; unsigned type_:TYPE_BITS; + unsigned no_try_delta:1; unsigned in_pack_type:TYPE_BITS; /* could be delta */ - unsigned type_valid:1; - uint32_t hash; /* name hint hash */ - unsigned char in_pack_header_size; unsigned preferred_base:1; /* * we do not pack this, but is available * to be used as the base object to delta * objects against. */ - unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ unsigned dfs_state:OE_DFS_STATE_BITS; + unsigned char in_pack_header_size; unsigned depth:OE_DEPTH_BITS; + + /* + * pahole results on 64-bit linux (gcc and clang) + * + * size: 80, bit_padding: 20 bits, holes: 8 bits + * + * and on 32-bit (gcc) + * + * size: 76, bit_padding: 20 bits, holes: 8 bits + */ }; struct packing_data { -- cgit v1.2.3 From f6a5576d521693963092dd69355d8e96ecd73635 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:13 +0200 Subject: ci: exercise the whole test suite with uncommon code in pack-objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some recent optimizations have been added to pack-objects to reduce memory usage and some code paths are split into two: one for common use cases and one for rare ones. Make sure the rare cases are tested with Travis since it requires manual test configuration that is unlikely to be done by developers. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- ci/run-build-and-tests.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh index 8190a753de3..4b04c75b7f8 100755 --- a/ci/run-build-and-tests.sh +++ b/ci/run-build-and-tests.sh @@ -11,7 +11,10 @@ make --jobs=2 make --quiet test if test "$jobname" = "linux-gcc" then - GIT_TEST_SPLIT_INDEX=yes make --quiet test + export GIT_TEST_SPLIT_INDEX=yes + export GIT_TEST_FULL_IN_PACK_ARRAY=true + export GIT_TEST_OE_SIZE=10 + make --quiet test fi check_unignored_build_artifacts -- cgit v1.2.3