From 16950f8384afa5106b1ce57da07a964c2aaef3f7 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 9 Feb 2021 05:53:50 -0500 Subject: rev-list: add --disk-usage option for calculating disk usage It can sometimes be useful to see which refs are contributing to the overall repository size (e.g., does some branch have a bunch of objects not found elsewhere in history, which indicates that deleting it would shrink the size of a clone). You can find that out by generating a list of objects, getting their sizes from cat-file, and then summing them, like: git rev-list --objects --no-object-names main..branch git cat-file --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' Though note that the caveats from git-cat-file(1) apply here. We "blame" base objects more than their deltas, even though the relationship could easily be flipped. Still, it can be a useful rough measure. But one problem is that it's slow to run. Teaching rev-list to sum up the sizes can be much faster for two reasons: 1. It skips all of the piping of object names and sizes. 2. If bitmaps are in use, for objects that are in the bitmapped packfile we can skip the oid_object_info() lookup entirely, and just ask the revindex for the on-disk size. This patch implements a --disk-usage option which produces the same answer in a fraction of the time. Here are some timings using a clone of torvalds/linux: [rev-list piped to cat-file, no bitmaps] $ time git rev-list --objects --no-object-names --all | git cat-file --buffer --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' 1459938510 real 0m29.635s user 0m38.003s sys 0m1.093s [internal, no bitmaps] $ time git rev-list --disk-usage --objects --all 1459938510 real 0m31.262s user 0m30.885s sys 0m0.376s Even though the wall-clock time is slightly worse due to parallelism, notice the CPU savings between the two. We saved 21% of the CPU just by avoiding the pipes. But the real win is with bitmaps. If we use them without the new option: [rev-list piped to cat-file, bitmaps] $ time git rev-list --objects --no-object-names --all --use-bitmap-index | git cat-file --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' 1459938510 real 0m6.244s user 0m8.452s sys 0m0.311s then we're faster to generate the list of objects, but we still spend a lot of time piping and looking things up. But if we do both together: [internal, bitmaps] $ time git rev-list --disk-usage --objects --all --use-bitmap-index 1459938510 real 0m0.219s user 0m0.169s sys 0m0.049s then we get the same answer much faster. For "--all", that answer will correspond closely to "du objects/pack", of course. But we're actually checking reachability here, so we're still fast when we ask for more interesting things: $ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10 374798628 real 0m0.429s user 0m0.356s sys 0m0.072s Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- pack-bitmap.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'pack-bitmap.c') diff --git a/pack-bitmap.c b/pack-bitmap.c index 60fe20fb87..1f69b5fa85 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -1430,3 +1430,84 @@ int bitmap_has_oid_in_uninteresting(struct bitmap_index *bitmap_git, return bitmap_git && bitmap_walk_contains(bitmap_git, bitmap_git->haves, oid); } + +static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git, + enum object_type object_type) +{ + struct bitmap *result = bitmap_git->result; + struct packed_git *pack = bitmap_git->pack; + off_t total = 0; + struct ewah_iterator it; + eword_t filter; + size_t i; + + init_type_iterator(&it, bitmap_git, object_type); + for (i = 0; i < result->word_alloc && + ewah_iterator_next(&filter, &it); i++) { + eword_t word = result->words[i] & filter; + size_t base = (i * BITS_IN_EWORD); + unsigned offset; + + if (!word) + continue; + + for (offset = 0; offset < BITS_IN_EWORD; offset++) { + size_t pos; + + if ((word >> offset) == 0) + break; + + offset += ewah_bit_ctz64(word >> offset); + pos = base + offset; + total += pack_pos_to_offset(pack, pos + 1) - + pack_pos_to_offset(pack, pos); + } + } + + return total; +} + +static off_t get_disk_usage_for_extended(struct bitmap_index *bitmap_git) +{ + struct bitmap *result = bitmap_git->result; + struct packed_git *pack = bitmap_git->pack; + struct eindex *eindex = &bitmap_git->ext_index; + off_t total = 0; + struct object_info oi = OBJECT_INFO_INIT; + off_t object_size; + size_t i; + + oi.disk_sizep = &object_size; + + for (i = 0; i < eindex->count; i++) { + struct object *obj = eindex->objects[i]; + + if (!bitmap_get(result, pack->num_objects + i)) + continue; + + if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0) + die(_("unable to get disk usage of %s"), + oid_to_hex(&obj->oid)); + + total += object_size; + } + return total; +} + +off_t get_disk_usage_from_bitmap(struct bitmap_index *bitmap_git, + struct rev_info *revs) +{ + off_t total = 0; + + total += get_disk_usage_for_type(bitmap_git, OBJ_COMMIT); + if (revs->tree_objects) + total += get_disk_usage_for_type(bitmap_git, OBJ_TREE); + if (revs->blob_objects) + total += get_disk_usage_for_type(bitmap_git, OBJ_BLOB); + if (revs->tag_objects) + total += get_disk_usage_for_type(bitmap_git, OBJ_TAG); + + total += get_disk_usage_for_extended(bitmap_git); + + return total; +} -- cgit v1.2.3