From 8d6ccce14fd1a5843f5c9b231d4dcb81f6ceeb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:00 +0200 Subject: pack-objects: a bit of document about struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The role of this comment block becomes more important after we shuffle fields around to shrink this struct. It will be much harder to see what field is related to what. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 03f1191659..c0a1f61aac 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,6 +1,51 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +/* + * basic object info + * ----------------- + * idx.oid is filled up before delta searching starts. idx.crc32 is + * only valid after the object is written out and will be used for + * generating the index. idx.offset will be both gradually set and + * used in writing phase (base objects get offset first, then deltas + * refer to them) + * + * "size" is the uncompressed object size. Compressed size of the raw + * data for an object in a pack is not stored anywhere but is computed + * and made available when reverse .idx is made. + * + * "hash" contains a path name hash which is used for sorting the + * delta list and also during delta searching. Once prepare_pack() + * returns it's no longer needed. + * + * source pack info + * ---------------- + * The (in_pack, in_pack_offset) tuple contains the location of the + * object in the source pack. in_pack_header_size allows quickly + * skipping the header and going straight to the zlib stream. + * + * "type" and "in_pack_type" both describe object type. in_pack_type + * may contain a delta type, while type is always the canonical type. + * + * deltas + * ------ + * Delta links (delta, delta_child and delta_sibling) are created to + * reflect that delta graph from the source pack then updated or added + * during delta searching phase when we find better deltas. + * + * delta_child and delta_sibling are last needed in + * compute_write_order(). "delta" and "delta_size" must remain valid + * at object writing phase in case the delta is not cached. + * + * If a delta is cached in memory and is compressed, delta_data points + * to the data and z_delta_size contains the compressed size. If it's + * uncompressed [1], z_delta_size must be zero. delta_size is always + * the uncompressed size and must be valid even if the delta is not + * cached. + * + * [1] during try_delta phase we don't bother with compressing because + * the delta could be quickly replaced with a better one. + */ struct object_entry { struct pack_idx_entry idx; unsigned long size; /* uncompressed size */ -- cgit v1.2.3 From fd9b1baef8a940c9c251995b006a3d96f210e639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:01 +0200 Subject: pack-objects: turn type and in_pack_type to bitfields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An extra field type_valid is added to carry the equivalent of OBJ_BAD in the original "type" field. in_pack_type always contains a valid type so we only need 3 bits for it. A note about accepting OBJ_NONE as "valid" type. The function read_object_list_from_stdin() can pass this value [1] and it eventually calls create_object_entry() where current code skip setting "type" field if the incoming type is zero. This does not have any bad side effects because "type" field should be memset()'d anyway. But since we also need to set type_valid now, skipping oe_set_type() leaves type_valid zero/false, which will make oe_type() return OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger fatal: unable to get type of object ... Accepting OBJ_NONE [2] does sound wrong, but this is how it is has been for a very long time and I haven't time to dig in further. [1] See 5c49c11686 (pack-objects: better check_object() performances - 2007-04-16) [2] 21666f1aae (convert object type handling from a string to a number - 2007-02-26) Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index c0a1f61aac..b4a83a6123 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -59,8 +59,9 @@ struct object_entry { void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ unsigned long z_delta_size; /* delta data size (compressed) */ - enum object_type type; - enum object_type in_pack_type; /* could be delta */ + unsigned type_:TYPE_BITS; + unsigned in_pack_type:TYPE_BITS; /* could be delta */ + unsigned type_valid:1; uint32_t hash; /* name hint hash */ unsigned int in_pack_pos; unsigned char in_pack_header_size; @@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name) return hash; } +static inline enum object_type oe_type(const struct object_entry *e) +{ + return e->type_valid ? e->type_ : OBJ_BAD; +} + +static inline void oe_set_type(struct object_entry *e, + enum object_type type) +{ + if (type >= OBJ_ANY) + BUG("OBJ_ANY cannot be set in pack-objects code"); + + e->type_valid = type >= OBJ_NONE; + e->type_ = (unsigned)type; +} + #endif -- cgit v1.2.3 From 0c6804ab4ee5cfa47fe28e0a2d20415c5c1f8884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:02 +0200 Subject: pack-objects: use bitfield for object_entry::dfs_state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index b4a83a6123..080ef62d31 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,6 +1,21 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +#define OE_DFS_STATE_BITS 2 + +/* + * State flags for depth-first search used for analyzing delta cycles. + * + * The depth is measured in delta-links to the base (so if A is a delta + * against B, then A has a depth of 1, and B a depth of 0). + */ +enum dfs_state { + DFS_NONE = 0, + DFS_ACTIVE, + DFS_DONE, + DFS_NUM_STATES +}; + /* * basic object info * ----------------- @@ -73,19 +88,10 @@ struct object_entry { unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ + unsigned dfs_state:OE_DFS_STATE_BITS; - /* - * State flags for depth-first search used for analyzing delta cycles. - * - * The depth is measured in delta-links to the base (so if A is a delta - * against B, then A has a depth of 1, and B a depth of 0). - */ - enum { - DFS_NONE = 0, - DFS_ACTIVE, - DFS_DONE - } dfs_state; int depth; + }; struct packing_data { -- cgit v1.2.3 From b5c0cbd8083f71e071207fca0d5434c6db6ff6c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:03 +0200 Subject: pack-objects: use bitfield for object_entry::depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because of struct packing from now on we can only handle max depth 4095 (or even lower when new booleans are added in this struct). This should be ok since long delta chain will cause significant slow down anyway. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 080ef62d31..cdce1648de 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -2,6 +2,7 @@ #define PACK_OBJECTS_H #define OE_DFS_STATE_BITS 2 +#define OE_DEPTH_BITS 12 /* * State flags for depth-first search used for analyzing delta cycles. @@ -89,9 +90,7 @@ struct object_entry { unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ unsigned dfs_state:OE_DFS_STATE_BITS; - - int depth; - + unsigned depth:OE_DEPTH_BITS; }; struct packing_data { -- cgit v1.2.3 From 06af3bba414b832fe9e04fb959daa2b9b678d2d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:04 +0200 Subject: pack-objects: move in_pack_pos out of struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This field is only need for pack-bitmap, which is an optional feature. Move it to a separate array that is only allocated when pack-bitmap is used (like objects[], it is not freed, since we need it until the end of the process) Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index cdce1648de..71ea992c3c 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -79,7 +79,6 @@ struct object_entry { unsigned in_pack_type:TYPE_BITS; /* could be delta */ unsigned type_valid:1; uint32_t hash; /* name hint hash */ - unsigned int in_pack_pos; unsigned char in_pack_header_size; unsigned preferred_base:1; /* * we do not pack this, but is available @@ -99,6 +98,8 @@ struct packing_data { int32_t *index; uint32_t index_size; + + unsigned int *in_pack_pos; }; struct object_entry *packlist_alloc(struct packing_data *pdata, @@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e, e->type_ = (unsigned)type; } +static inline unsigned int oe_in_pack_pos(const struct packing_data *pack, + const struct object_entry *e) +{ + return pack->in_pack_pos[e - pack->objects]; +} + +static inline void oe_set_in_pack_pos(const struct packing_data *pack, + const struct object_entry *e, + unsigned int pos) +{ + pack->in_pack_pos[e - pack->objects] = pos; +} + #endif -- cgit v1.2.3 From 43fa44fa3b68e6570145126892e1e43380d7bb5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:05 +0200 Subject: pack-objects: move in_pack out of struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using 8 bytes (on 64 bit arch) to store a pointer to a pack. Use an index instead since the number of packs should be relatively small. This limits the number of packs we can handle to 1k. Since we can't be sure people can never run into the situation where they have more than 1k pack files. Provide a fall back route for it. If we find out they have too many packs, the new in_pack_by_idx[] array (which has at most 1k elements) will not be used. Instead we allocate in_pack[] array that holds nr_objects elements. This is similar to how the optional in_pack_pos field is handled. The new simple test is just to make sure the too-many-packs code path is at least executed. The true test is running make test GIT_TEST_FULL_IN_PACK_ARRAY=1 to take advantage of other special case tests. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 71ea992c3c..7bedd5af81 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,8 +1,11 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +#include "object-store.h" + #define OE_DFS_STATE_BITS 2 #define OE_DEPTH_BITS 12 +#define OE_IN_PACK_BITS 10 /* * State flags for depth-first search used for analyzing delta cycles. @@ -65,7 +68,7 @@ enum dfs_state { struct object_entry { struct pack_idx_entry idx; unsigned long size; /* uncompressed size */ - struct packed_git *in_pack; /* already in pack */ + unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; struct object_entry *delta; /* delta base object */ struct object_entry *delta_child; /* deltified objects who bases me */ @@ -100,8 +103,18 @@ struct packing_data { uint32_t index_size; unsigned int *in_pack_pos; + + /* + * Only one of these can be non-NULL and they have different + * sizes. if in_pack_by_idx is allocated, oe_in_pack() returns + * the pack of an object using in_pack_idx field. If not, + * in_pack[] array is used the same way as in_pack_pos[] + */ + struct packed_git **in_pack_by_idx; + struct packed_git **in_pack; }; +void prepare_packing_data(struct packing_data *pdata); struct object_entry *packlist_alloc(struct packing_data *pdata, const unsigned char *sha1, uint32_t index_pos); @@ -158,4 +171,27 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack, pack->in_pack_pos[e - pack->objects] = pos; } +static inline struct packed_git *oe_in_pack(const struct packing_data *pack, + const struct object_entry *e) +{ + if (pack->in_pack_by_idx) + return pack->in_pack_by_idx[e->in_pack_idx]; + else + return pack->in_pack[e - pack->objects]; +} + +void oe_map_new_pack(struct packing_data *pack, + struct packed_git *p); +static inline void oe_set_in_pack(struct packing_data *pack, + struct object_entry *e, + struct packed_git *p) +{ + if (!p->index) + oe_map_new_pack(pack, p); + if (pack->in_pack_by_idx) + e->in_pack_idx = p->index; + else + pack->in_pack[e - pack->objects] = p; +} + #endif -- cgit v1.2.3 From 898eba5e630696608ddca0ede489378e87464ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:06 +0200 Subject: pack-objects: refer to delta objects by index instead of pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These delta pointers always point to elements in the objects[] array in packing_data struct. We can only hold maximum 4G of those objects because the array size in nr_objects is uint32_t. We could use uint32_t indexes to address these elements instead of pointers. On 64-bit architecture (8 bytes per pointer) this would save 4 bytes per pointer. Convert these delta pointers to indexes. Since we need to handle NULL pointers as well, the index is shifted by one [1]. [1] This means we can only index 2^32-2 objects even though nr_objects could contain 2^32-1 objects. It should not be a problem in practice because when we grow objects[], nr_alloc would probably blow up long before nr_objects hits the wall. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 5 deletions(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 7bedd5af81..e962dce3c0 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -70,11 +70,11 @@ struct object_entry { unsigned long size; /* uncompressed size */ unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; - struct object_entry *delta; /* delta base object */ - struct object_entry *delta_child; /* deltified objects who bases me */ - struct object_entry *delta_sibling; /* other deltified objects who - * uses the same base as me - */ + uint32_t delta_idx; /* delta base object */ + uint32_t delta_child_idx; /* deltified objects who bases me */ + uint32_t delta_sibling_idx; /* other deltified objects who + * uses the same base as me + */ void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ unsigned long z_delta_size; /* delta data size (compressed) */ @@ -194,4 +194,61 @@ static inline void oe_set_in_pack(struct packing_data *pack, pack->in_pack[e - pack->objects] = p; } +static inline struct object_entry *oe_delta( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_idx) + return &pack->objects[e->delta_idx - 1]; + return NULL; +} + +static inline void oe_set_delta(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_idx = (delta - pack->objects) + 1; + else + e->delta_idx = 0; +} + +static inline struct object_entry *oe_delta_child( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_child_idx) + return &pack->objects[e->delta_child_idx - 1]; + return NULL; +} + +static inline void oe_set_delta_child(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_child_idx = (delta - pack->objects) + 1; + else + e->delta_child_idx = 0; +} + +static inline struct object_entry *oe_delta_sibling( + const struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_sibling_idx) + return &pack->objects[e->delta_sibling_idx - 1]; + return NULL; +} + +static inline void oe_set_delta_sibling(struct packing_data *pack, + struct object_entry *e, + struct object_entry *delta) +{ + if (delta) + e->delta_sibling_idx = (delta - pack->objects) + 1; + else + e->delta_sibling_idx = 0; +} + #endif -- cgit v1.2.3 From 0cb3c1427a1e9cee638e0c1629933c907493d7e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:07 +0200 Subject: pack-objects: shrink z_delta_size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only cache deltas when it's smaller than a certain limit. This limit defaults to 1000 but save its compressed length in a 64-bit field. Shrink that field down to 20 bits, so you can only cache 1MB deltas. Larger deltas must be recomputed at when the pack is written down. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index e962dce3c0..9d0391c173 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -6,6 +6,7 @@ #define OE_DFS_STATE_BITS 2 #define OE_DEPTH_BITS 12 #define OE_IN_PACK_BITS 10 +#define OE_Z_DELTA_BITS 20 /* * State flags for depth-first search used for analyzing delta cycles. @@ -77,7 +78,7 @@ struct object_entry { */ void *delta_data; /* cached delta (uncompressed) */ unsigned long delta_size; /* delta data size (uncompressed) */ - unsigned long z_delta_size; /* delta data size (compressed) */ + unsigned z_delta_size:OE_Z_DELTA_BITS; unsigned type_:TYPE_BITS; unsigned in_pack_type:TYPE_BITS; /* could be delta */ unsigned type_valid:1; -- cgit v1.2.3 From 27a7d0679f17bd536f566e76e51058de0e1fa17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:09 +0200 Subject: pack-objects: clarify the use of object_entry::size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While this field most of the time contains the canonical object size, there is one case it does not: when we have found that the base object of the delta in question is also to be packed, we will very happily reuse the delta by copying it over instead of regenerating the new delta. "size" in this case will record the delta size, not canonical object size. Later on in write_reuse_object(), we reconstruct the delta header and "size" is used for this purpose. When this happens, the "type" field contains a delta type instead of a canonical type. Highlight this in the code since it could be tricky to see. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 9d0391c173..e4ea6a350c 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -32,7 +32,9 @@ enum dfs_state { * * "size" is the uncompressed object size. Compressed size of the raw * data for an object in a pack is not stored anywhere but is computed - * and made available when reverse .idx is made. + * and made available when reverse .idx is made. Note that when a + * delta is reused, "size" is the uncompressed _delta_ size, not the + * canonical one after the delta has been applied. * * "hash" contains a path name hash which is used for sorting the * delta list and also during delta searching. Once prepare_pack() -- cgit v1.2.3 From ac77d0c370dc5b23ac1ec4a13c754fe7ffa48564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:10 +0200 Subject: pack-objects: shrink size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's very very rare that an uncompressed object is larger than 4GB (partly because Git does not handle those large files very well to begin with). Let's optimize it for the common case where object size is smaller than this limit. Shrink size field down to 31 bits and one overflow bit. If the size is too large, we read it back from disk. As noted in the previous patch, we need to return the delta size instead of canonical size when the to-be-reused object entry type is a delta instead of a canonical one. Add two compare helpers that can take advantage of the overflow bit (e.g. if the file is 4GB+, chances are it's already larger than core.bigFileThreshold and there's no point in comparing the actual value). Another note about oe_get_size_slow(). This function MUST be thread safe because SIZE() macro is used inside try_delta() which may run in parallel. Outside parallel code, no-contention locking should be dirt cheap (or insignificant compared to i/o access anyway). To exercise this code, it's best to run the test suite with something like make test GIT_TEST_OE_SIZE=4 which forces this code on all objects larger than 3 bytes. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index e4ea6a350c..ee2c7ab382 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -7,6 +7,11 @@ #define OE_DEPTH_BITS 12 #define OE_IN_PACK_BITS 10 #define OE_Z_DELTA_BITS 20 +/* + * Note that oe_set_size() becomes expensive when the given size is + * above this limit. Don't lower it too much. + */ +#define OE_SIZE_BITS 31 /* * State flags for depth-first search used for analyzing delta cycles. @@ -70,7 +75,8 @@ enum dfs_state { */ struct object_entry { struct pack_idx_entry idx; - unsigned long size; /* uncompressed size */ + unsigned size_:OE_SIZE_BITS; + unsigned size_valid:1; unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ off_t in_pack_offset; uint32_t delta_idx; /* delta base object */ @@ -115,6 +121,8 @@ struct packing_data { */ struct packed_git **in_pack_by_idx; struct packed_git **in_pack; + + uintmax_t oe_size_limit; }; void prepare_packing_data(struct packing_data *pdata); @@ -254,4 +262,51 @@ static inline void oe_set_delta_sibling(struct packing_data *pack, e->delta_sibling_idx = 0; } +unsigned long oe_get_size_slow(struct packing_data *pack, + const struct object_entry *e); +static inline unsigned long oe_size(struct packing_data *pack, + const struct object_entry *e) +{ + if (e->size_valid) + return e->size_; + + return oe_get_size_slow(pack, e); +} + +static inline int oe_size_less_than(struct packing_data *pack, + const struct object_entry *lhs, + unsigned long rhs) +{ + if (lhs->size_valid) + return lhs->size_ < rhs; + if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */ + return 0; + return oe_get_size_slow(pack, lhs) < rhs; +} + +static inline int oe_size_greater_than(struct packing_data *pack, + const struct object_entry *lhs, + unsigned long rhs) +{ + if (lhs->size_valid) + return lhs->size_ > rhs; + if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */ + return 1; + return oe_get_size_slow(pack, lhs) > rhs; +} + +static inline void oe_set_size(struct packing_data *pack, + struct object_entry *e, + unsigned long size) +{ + if (size < pack->oe_size_limit) { + e->size_ = size; + e->size_valid = 1; + } else { + e->size_valid = 0; + if (oe_get_size_slow(pack, e) != size) + BUG("'size' is supposed to be the object size!"); + } +} + #endif -- cgit v1.2.3 From 0aca34e8269514ebb67676e0470a314615494ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:11 +0200 Subject: pack-objects: shrink delta_size field in struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allowing a delta size of 64 bits is crazy. Shrink this field down to 20 bits with one overflow bit. If we find an existing delta larger than 1MB, we do not cache delta_size at all and will get the value from oe_size(), potentially from disk if it's larger than 4GB. Note, since DELTA_SIZE() is used in try_delta() code, it must be thread-safe. Luckily oe_size() does guarantee this so we it is thread-safe. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index ee2c7ab382..1c588184b2 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -12,6 +12,7 @@ * above this limit. Don't lower it too much. */ #define OE_SIZE_BITS 31 +#define OE_DELTA_SIZE_BITS 20 /* * State flags for depth-first search used for analyzing delta cycles. @@ -85,7 +86,8 @@ struct object_entry { * uses the same base as me */ void *delta_data; /* cached delta (uncompressed) */ - unsigned long delta_size; /* delta data size (uncompressed) */ + unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */ + unsigned delta_size_valid:1; unsigned z_delta_size:OE_Z_DELTA_BITS; unsigned type_:TYPE_BITS; unsigned in_pack_type:TYPE_BITS; /* could be delta */ @@ -309,4 +311,23 @@ static inline void oe_set_size(struct packing_data *pack, } } +static inline unsigned long oe_delta_size(struct packing_data *pack, + const struct object_entry *e) +{ + if (e->delta_size_valid) + return e->delta_size_; + return oe_size(pack, e); +} + +static inline void oe_set_delta_size(struct packing_data *pack, + struct object_entry *e, + unsigned long size) +{ + e->delta_size_ = size; + e->delta_size_valid = e->delta_size_ == size; + if (!e->delta_size_valid && size != oe_size(pack, e)) + BUG("this can only happen in check_object() " + "where delta size is the same as entry size"); +} + #endif -- cgit v1.2.3 From 3b13a5f263872c4643f7f0eb6eccb7a8196b2760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 14 Apr 2018 17:35:12 +0200 Subject: pack-objects: reorder members to shrink struct object_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches leave lots of holes and padding in this struct. This patch reorders the members and shrinks the struct down to 80 bytes (from 136 bytes on 64-bit systems, before any field shrinking is done) with 16 bits to spare (and a couple more in in_pack_header_size when we really run out of bits). This is the last in a series of memory reduction patches (see "pack-objects: a bit of document about struct object_entry" for the first one). Overall they've reduced repack memory size on linux-2.6.git from 3.747G to 3.424G, or by around 320M, a decrease of 8.5%. The runtime of repack has stayed the same throughout this series. Ævar's testing on a big monorepo he has access to (bigger than linux-2.6.git) has shown a 7.9% reduction, so the overall expected improvement should be somewhere around 8%. See 87po42cwql.fsf@evledraar.gmail.com on-list (https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/) for more detailed numbers and a test script used to produce the numbers cited above. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- pack-objects.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'pack-objects.h') diff --git a/pack-objects.h b/pack-objects.h index 1c588184b2..e5456c6c89 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -28,6 +28,10 @@ enum dfs_state { }; /* + * The size of struct nearly determines pack-objects's memory + * consumption. This struct is packed tight for that reason. When you + * add or reorder something in this struct, think a bit about this. + * * basic object info * ----------------- * idx.oid is filled up before delta searching starts. idx.crc32 is @@ -76,34 +80,44 @@ enum dfs_state { */ struct object_entry { struct pack_idx_entry idx; + void *delta_data; /* cached delta (uncompressed) */ + off_t in_pack_offset; + uint32_t hash; /* name hint hash */ unsigned size_:OE_SIZE_BITS; unsigned size_valid:1; - unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ - off_t in_pack_offset; uint32_t delta_idx; /* delta base object */ uint32_t delta_child_idx; /* deltified objects who bases me */ uint32_t delta_sibling_idx; /* other deltified objects who * uses the same base as me */ - void *delta_data; /* cached delta (uncompressed) */ unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */ unsigned delta_size_valid:1; + unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */ unsigned z_delta_size:OE_Z_DELTA_BITS; + unsigned type_valid:1; unsigned type_:TYPE_BITS; + unsigned no_try_delta:1; unsigned in_pack_type:TYPE_BITS; /* could be delta */ - unsigned type_valid:1; - uint32_t hash; /* name hint hash */ - unsigned char in_pack_header_size; unsigned preferred_base:1; /* * we do not pack this, but is available * to be used as the base object to delta * objects against. */ - unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ unsigned dfs_state:OE_DFS_STATE_BITS; + unsigned char in_pack_header_size; unsigned depth:OE_DEPTH_BITS; + + /* + * pahole results on 64-bit linux (gcc and clang) + * + * size: 80, bit_padding: 20 bits, holes: 8 bits + * + * and on 32-bit (gcc) + * + * size: 76, bit_padding: 20 bits, holes: 8 bits + */ }; struct packing_data { -- cgit v1.2.3