From 66f0c71073ee5fe1c9d12d2952305a4793d7b43f Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:39 -0500 Subject: pack-objects: free packing_data in more places The pack-objects internals use a packing_data struct to track what objects are part of the pack(s) being formed. Since these structures contain allocated fields, failing to appropriately free() them results in a leak. Plug that leak by introducing a clear_packing_data() function, and call it in the appropriate spots. This is a fairly straightforward leak to plug, since none of the callers expect to read any values or have any references to parts of the address space being freed. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'midx.c') diff --git a/midx.c b/midx.c index 1d14661dad..778dd536c8 100644 --- a/midx.c +++ b/midx.c @@ -1603,8 +1603,13 @@ static int write_midx_internal(const char *object_dir, flags) < 0) { error(_("could not write multi-pack bitmap")); result = 1; + clear_packing_data(&pdata); + free(commits); goto cleanup; } + + clear_packing_data(&pdata); + free(commits); } /* * NOTE: Do not use ctx.entries beyond this point, since it might -- cgit v1.2.3 From fba68184b8de477934d1642ac1c311cfcf0f6041 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:48 -0500 Subject: midx: factor out `fill_pack_info()` When selecting which packfiles will be written while generating a MIDX, the MIDX internals fill out a 'struct pack_info' with various pieces of book-keeping. Instead of filling out each field of the `pack_info` structure individually in each of the two spots that modify the array of such structures (`ctx->info`), extract a common routine that does this for us. This reduces the code duplication by a modest amount. But more importantly, it zero-initializes the structure before assigning values into it. This hardens us for a future change which will add additional fields to this structure which (until this patch) was not zero-initialized. As a result, any new fields added to the `pack_info` structure need only be updated in a single location, instead of at each spot within midx.c. There are no functional changes in this patch. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'midx.c') diff --git a/midx.c b/midx.c index 778dd536c8..8dba67ddbe 100644 --- a/midx.c +++ b/midx.c @@ -475,6 +475,17 @@ struct pack_info { unsigned expired : 1; }; +static void fill_pack_info(struct pack_info *info, + struct packed_git *p, const char *pack_name, + uint32_t orig_pack_int_id) +{ + memset(info, 0, sizeof(struct pack_info)); + + info->orig_pack_int_id = orig_pack_int_id; + info->pack_name = xstrdup(pack_name); + info->p = p; +} + static int pack_info_compare(const void *_a, const void *_b) { struct pack_info *a = (struct pack_info *)_a; @@ -515,6 +526,7 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len, const char *file_name, void *data) { struct write_midx_context *ctx = data; + struct packed_git *p; if (ends_with(file_name, ".idx")) { display_progress(ctx->progress, ++ctx->pack_paths_checked); @@ -541,27 +553,22 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len, ALLOC_GROW(ctx->info, ctx->nr + 1, ctx->alloc); - ctx->info[ctx->nr].p = add_packed_git(full_path, - full_path_len, - 0); - - if (!ctx->info[ctx->nr].p) { + p = add_packed_git(full_path, full_path_len, 0); + if (!p) { warning(_("failed to add packfile '%s'"), full_path); return; } - if (open_pack_index(ctx->info[ctx->nr].p)) { + if (open_pack_index(p)) { warning(_("failed to open pack-index '%s'"), full_path); - close_pack(ctx->info[ctx->nr].p); - FREE_AND_NULL(ctx->info[ctx->nr].p); + close_pack(p); + free(p); return; } - ctx->info[ctx->nr].pack_name = xstrdup(file_name); - ctx->info[ctx->nr].orig_pack_int_id = ctx->nr; - ctx->info[ctx->nr].expired = 0; + fill_pack_info(&ctx->info[ctx->nr], p, file_name, ctx->nr); ctx->nr++; } } @@ -1321,11 +1328,6 @@ static int write_midx_internal(const char *object_dir, for (i = 0; i < ctx.m->num_packs; i++) { ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc); - ctx.info[ctx.nr].orig_pack_int_id = i; - ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]); - ctx.info[ctx.nr].p = ctx.m->packs[i]; - ctx.info[ctx.nr].expired = 0; - if (flags & MIDX_WRITE_REV_INDEX) { /* * If generating a reverse index, need to have @@ -1341,10 +1343,10 @@ static int write_midx_internal(const char *object_dir, if (open_pack_index(ctx.m->packs[i])) die(_("could not open index for %s"), ctx.m->packs[i]->pack_name); - ctx.info[ctx.nr].p = ctx.m->packs[i]; } - ctx.nr++; + fill_pack_info(&ctx.info[ctx.nr++], ctx.m->packs[i], + ctx.m->pack_names[i], i); } } -- cgit v1.2.3 From 5f5ccd959573f88e126d53df16b149c64e6e9091 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:51 -0500 Subject: midx: implement `BTMP` chunk When a multi-pack bitmap is used to implement verbatim pack reuse (that is, when verbatim chunks from an on-disk packfile are copied directly[^1]), it does so by using its "preferred pack" as the source for pack-reuse. This allows repositories to pack the majority of their objects into a single (often large) pack, and then use it as the single source for verbatim pack reuse. This increases the amount of objects that are reused verbatim (and consequently, decrease the amount of time it takes to generate many packs). But this performance comes at a cost, which is that the preferred packfile must pace its growth with that of the entire repository in order to maintain the utility of verbatim pack reuse. As repositories grow beyond what we can reasonably store in a single packfile, the utility of verbatim pack reuse diminishes. Or, at the very least, it becomes increasingly more expensive to maintain as the pack grows larger and larger. It would be beneficial to be able to perform this same optimization over multiple packs, provided some modest constraints (most importantly, that the set of packs eligible for verbatim reuse are disjoint with respect to the subset of their objects being sent). If we assume that the packs which we treat as candidates for verbatim reuse are disjoint with respect to any of their objects we may output, we need to make only modest modifications to the verbatim pack-reuse code itself. Most notably, we need to remove the assumption that the bits in the reachability bitmap corresponding to objects from the single reuse pack begin at the first bit position. Future patches will unwind these assumptions and reimplement their existing functionality as special cases of the more general assumptions (e.g. that reuse bits can start anywhere within the bitset, but happen to start at 0 for all existing cases). This patch does not yet relax any of those assumptions. Instead, it implements a foundational data-structure, the "Bitampped Packs" (`BTMP`) chunk of the multi-pack index. The `BTMP` chunk's contents are described in detail here. Importantly, the `BTMP` chunk contains information to map regions of a multi-pack index's reachability bitmap to the packs whose objects they represent. For now, this chunk is only written, not read (outside of the test-tool used in this patch to test the new chunk's behavior). Future patches will begin to make use of this new chunk. [^1]: Modulo patching any `OFS_DELTA`'s that cross over a region of the pack that wasn't used verbatim. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 3 deletions(-) (limited to 'midx.c') diff --git a/midx.c b/midx.c index 8dba67ddbe..de25612b0c 100644 --- a/midx.c +++ b/midx.c @@ -33,6 +33,7 @@ #define MIDX_CHUNK_ALIGNMENT 4 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */ +#define MIDX_CHUNKID_BITMAPPEDPACKS 0x42544d50 /* "BTMP" */ #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */ @@ -41,6 +42,7 @@ #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256) #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t)) #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t)) +#define MIDX_CHUNK_BITMAPPED_PACKS_WIDTH (2 * sizeof(uint32_t)) #define MIDX_LARGE_OFFSET_NEEDED 0x80000000 #define PACK_EXPIRED UINT_MAX @@ -193,6 +195,9 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local pair_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets, &m->chunk_large_offsets_len); + pair_chunk(cf, MIDX_CHUNKID_BITMAPPEDPACKS, + (const unsigned char **)&m->chunk_bitmapped_packs, + &m->chunk_bitmapped_packs_len); if (git_env_bool("GIT_TEST_MIDX_READ_RIDX", 1)) pair_chunk(cf, MIDX_CHUNKID_REVINDEX, &m->chunk_revindex, @@ -286,6 +291,26 @@ int prepare_midx_pack(struct repository *r, struct multi_pack_index *m, uint32_t return 0; } +int nth_bitmapped_pack(struct repository *r, struct multi_pack_index *m, + struct bitmapped_pack *bp, uint32_t pack_int_id) +{ + if (!m->chunk_bitmapped_packs) + return error(_("MIDX does not contain the BTMP chunk")); + + if (prepare_midx_pack(r, m, pack_int_id)) + return error(_("could not load bitmapped pack %"PRIu32), pack_int_id); + + bp->p = m->packs[pack_int_id]; + bp->bitmap_pos = get_be32((char *)m->chunk_bitmapped_packs + + MIDX_CHUNK_BITMAPPED_PACKS_WIDTH * pack_int_id); + bp->bitmap_nr = get_be32((char *)m->chunk_bitmapped_packs + + MIDX_CHUNK_BITMAPPED_PACKS_WIDTH * pack_int_id + + sizeof(uint32_t)); + bp->pack_int_id = pack_int_id; + + return 0; +} + int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result) { return bsearch_hash(oid->hash, m->chunk_oid_fanout, m->chunk_oid_lookup, @@ -468,10 +493,16 @@ static size_t write_midx_header(struct hashfile *f, return MIDX_HEADER_SIZE; } +#define BITMAP_POS_UNKNOWN (~((uint32_t)0)) + struct pack_info { uint32_t orig_pack_int_id; char *pack_name; struct packed_git *p; + + uint32_t bitmap_pos; + uint32_t bitmap_nr; + unsigned expired : 1; }; @@ -484,6 +515,7 @@ static void fill_pack_info(struct pack_info *info, info->orig_pack_int_id = orig_pack_int_id; info->pack_name = xstrdup(pack_name); info->p = p; + info->bitmap_pos = BITMAP_POS_UNKNOWN; } static int pack_info_compare(const void *_a, const void *_b) @@ -824,6 +856,26 @@ static int write_midx_pack_names(struct hashfile *f, void *data) return 0; } +static int write_midx_bitmapped_packs(struct hashfile *f, void *data) +{ + struct write_midx_context *ctx = data; + size_t i; + + for (i = 0; i < ctx->nr; i++) { + struct pack_info *pack = &ctx->info[i]; + if (pack->expired) + continue; + + if (pack->bitmap_pos == BITMAP_POS_UNKNOWN && pack->bitmap_nr) + BUG("pack '%s' has no bitmap position, but has %d bitmapped object(s)", + pack->pack_name, pack->bitmap_nr); + + hashwrite_be32(f, pack->bitmap_pos); + hashwrite_be32(f, pack->bitmap_nr); + } + return 0; +} + static int write_midx_oid_fanout(struct hashfile *f, void *data) { @@ -991,8 +1043,19 @@ static uint32_t *midx_pack_order(struct write_midx_context *ctx) QSORT(data, ctx->entries_nr, midx_pack_order_cmp); ALLOC_ARRAY(pack_order, ctx->entries_nr); - for (i = 0; i < ctx->entries_nr; i++) + for (i = 0; i < ctx->entries_nr; i++) { + struct pack_midx_entry *e = &ctx->entries[data[i].nr]; + struct pack_info *pack = &ctx->info[ctx->pack_perm[e->pack_int_id]]; + if (pack->bitmap_pos == BITMAP_POS_UNKNOWN) + pack->bitmap_pos = i; + pack->bitmap_nr++; pack_order[i] = data[i].nr; + } + for (i = 0; i < ctx->nr; i++) { + struct pack_info *pack = &ctx->info[ctx->pack_perm[i]]; + if (pack->bitmap_pos == BITMAP_POS_UNKNOWN) + pack->bitmap_pos = 0; + } free(data); trace2_region_leave("midx", "midx_pack_order", the_repository); @@ -1293,6 +1356,7 @@ static int write_midx_internal(const char *object_dir, struct hashfile *f = NULL; struct lock_file lk; struct write_midx_context ctx = { 0 }; + int bitmapped_packs_concat_len = 0; int pack_name_concat_len = 0; int dropped_packs = 0; int result = 0; @@ -1505,8 +1569,10 @@ static int write_midx_internal(const char *object_dir, } for (i = 0; i < ctx.nr; i++) { - if (!ctx.info[i].expired) - pack_name_concat_len += strlen(ctx.info[i].pack_name) + 1; + if (ctx.info[i].expired) + continue; + pack_name_concat_len += strlen(ctx.info[i].pack_name) + 1; + bitmapped_packs_concat_len += 2 * sizeof(uint32_t); } /* Check that the preferred pack wasn't expired (if given). */ @@ -1566,6 +1632,9 @@ static int write_midx_internal(const char *object_dir, add_chunk(cf, MIDX_CHUNKID_REVINDEX, st_mult(ctx.entries_nr, sizeof(uint32_t)), write_midx_revindex); + add_chunk(cf, MIDX_CHUNKID_BITMAPPEDPACKS, + bitmapped_packs_concat_len, + write_midx_bitmapped_packs); } write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs); -- cgit v1.2.3 From 307d75bbe6b33267872633173a5d9b5d88b87793 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:23:54 -0500 Subject: midx: implement `midx_locate_pack()` The multi-pack index API exposes a `midx_contains_pack()` function that takes in a string ending in either ".idx" or ".pack" and returns whether or not the MIDX contains a given pack corresponding to that string. There is no corresponding function to locate the position of a pack within the MIDX's pack order (sorted lexically by pack filename). We could add an optional out parameter to `midx_contains_pack()` that is filled out with the pack's position when the parameter is non-NULL. To minimize the amount of fallout from this change, instead introduce a new function by renaming `midx_contains_pack()` to `midx_locate_pack()`, adding that output parameter, and then reimplementing `midx_contains_pack()` in terms of it. Future patches will make use of this new function. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'midx.c') diff --git a/midx.c b/midx.c index de25612b0c..beaf0c0de4 100644 --- a/midx.c +++ b/midx.c @@ -428,7 +428,8 @@ static int cmp_idx_or_pack_name(const char *idx_or_pack_name, return strcmp(idx_or_pack_name, idx_name); } -int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name) +int midx_locate_pack(struct multi_pack_index *m, const char *idx_or_pack_name, + uint32_t *pos) { uint32_t first = 0, last = m->num_packs; @@ -439,8 +440,11 @@ int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name) current = m->pack_names[mid]; cmp = cmp_idx_or_pack_name(idx_or_pack_name, current); - if (!cmp) + if (!cmp) { + if (pos) + *pos = mid; return 1; + } if (cmp > 0) { first = mid + 1; continue; @@ -451,6 +455,11 @@ int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name) return 0; } +int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name) +{ + return midx_locate_pack(m, idx_or_pack_name, NULL); +} + int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local) { struct multi_pack_index *m; -- cgit v1.2.3 From b1e3333068247ddd44021a0b69457c249ddee7a1 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Thu, 14 Dec 2023 17:24:25 -0500 Subject: midx: implement `midx_preferred_pack()` When performing a binary search over the objects in a MIDX's bitmap (i.e. in pseudo-pack order), the reader reconstructs the pseudo-pack ordering using a combination of (a) the preferred pack, (b) the pack's lexical position in the MIDX based on pack names, and (c) the object offset within the pack. In order to perform this binary search, the reader must know the identity of the preferred pack. This could be stored in the MIDX, but isn't for historical reasons, mostly because it can easily be inferred at read-time by looking at the object in the first bit position and finding out which pack it was selected from in the MIDX, like so: nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0)); In midx_to_pack_pos() which performs this binary search, we look up the identity of the preferred pack before each search. This is relatively quick, since it involves two table-driven lookups (one in the MIDX's revindex for `pack_pos_to_midx()`, and another in the MIDX's object table for `nth_midxed_pack_int_id()`). But since the preferred pack does not change after the MIDX is written, it is safe to cache this value on the MIDX itself. Write a helper to do just that, and rewrite all of the existing call-sites that care about the identity of the preferred pack in terms of this new helper. This will prepare us for a subsequent patch where we will need to binary search through the MIDX's pseudo-pack order multiple times. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'midx.c') diff --git a/midx.c b/midx.c index beaf0c0de4..85e1c2cd12 100644 --- a/midx.c +++ b/midx.c @@ -21,6 +21,7 @@ #include "refs.h" #include "revision.h" #include "list-objects.h" +#include "pack-revindex.h" #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ #define MIDX_VERSION 1 @@ -177,6 +178,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS); + m->preferred_pack_idx = -1; + cf = init_chunkfile(NULL); if (read_table_of_contents(cf, m->data, midx_size, @@ -460,6 +463,23 @@ int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name) return midx_locate_pack(m, idx_or_pack_name, NULL); } +int midx_preferred_pack(struct multi_pack_index *m, uint32_t *pack_int_id) +{ + if (m->preferred_pack_idx == -1) { + if (load_midx_revindex(m) < 0) { + m->preferred_pack_idx = -2; + return -1; + } + + m->preferred_pack_idx = + nth_midxed_pack_int_id(m, pack_pos_to_midx(m, 0)); + } else if (m->preferred_pack_idx == -2) + return -1; /* no revindex */ + + *pack_int_id = m->preferred_pack_idx; + return 0; +} + int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local) { struct multi_pack_index *m; -- cgit v1.2.3