From eb591e42fd48dda59c309beba6991e95d552034f Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 1 May 2020 15:30:18 +0000 Subject: bloom: fix whitespace around tab length Fix alignment issues that were likely introduced due to an editor using tab lengths of 4 instead of 8. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'bloom.c') diff --git a/bloom.c b/bloom.c index dd9bab9bbd..2e3e0f5037 100644 --- a/bloom.c +++ b/bloom.c @@ -29,8 +29,8 @@ static inline unsigned char get_bitmask(uint32_t pos) } static int load_bloom_filter_from_graph(struct commit_graph *g, - struct bloom_filter *filter, - struct commit *c) + struct bloom_filter *filter, + struct commit *c) { uint32_t lex_pos, start_index, end_index; @@ -123,9 +123,9 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) } void fill_bloom_key(const char *data, - size_t len, - struct bloom_key *key, - const struct bloom_filter_settings *settings) + size_t len, + struct bloom_key *key, + const struct bloom_filter_settings *settings) { int i; const uint32_t seed0 = 0x293ae76f; @@ -139,8 +139,8 @@ void fill_bloom_key(const char *data, } void add_key_to_filter(const struct bloom_key *key, - struct bloom_filter *filter, - const struct bloom_filter_settings *settings) + struct bloom_filter *filter, + const struct bloom_filter_settings *settings) { int i; uint64_t mod = filter->len * BITS_PER_WORD; @@ -160,7 +160,7 @@ void init_bloom_filters(void) struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c, - int compute_if_not_present) + int compute_if_not_present) { struct bloom_filter *filter; struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; -- cgit v1.2.3 From 891c17c95497712126700cdf5cd887bdb0ff3d55 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:10 +0000 Subject: bloom: parse commit before computing filters When computing changed-path Bloom filters for a commit, we need to know if the commit has a parent or not. If the commit is not parsed, then its parent pointer will be NULL. As far as I can tell, the only opportunity to reach this code without parsing the commit is inside "test-tool bloom get_filter_for_commit" but it is best to be safe. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'bloom.c') diff --git a/bloom.c b/bloom.c index 2e3e0f5037..9679278271 100644 --- a/bloom.c +++ b/bloom.c @@ -193,6 +193,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r, diffopt.max_changes = max_changes; diff_setup_done(&diffopt); + /* ensure commit is parsed so we have parent information */ + repo_parse_commit(r, c); + if (c->parents) diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); else -- cgit v1.2.3 From 65c1a28bb60d1c17a672306c651bb402378f81b0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:12 +0000 Subject: bloom: de-duplicate directory entries When computing a changed-path Bloom filter, we need to take the files that changed from the diff computation and extract the parent directories. That way, a directory pathspec such as "Documentation" could match commits that change "Documentation/git.txt". However, the current code does a poor job of this process. The paths are added to a hashmap, but we do not check if an entry already exists with that path. This can create many duplicate entries and cause the filter to have a much larger length than it should. This means that the filter is more sparse than intended, which helps the false positive rate, but wastes a lot of space. Properly use hashmap_get() before hashmap_add(). Also be sure to include a comparison function so these can be matched correctly. This has an effect on a test in t0095-bloom.sh. This makes sense, there are ten changes inside "smallDir" so the total number of paths in the filter should be 11. This would result in 11 * 10 bits required, and with 8 bits per byte, this results in 14 bytes. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'bloom.c') diff --git a/bloom.c b/bloom.c index 9679278271..196cda0a1b 100644 --- a/bloom.c +++ b/bloom.c @@ -158,6 +158,19 @@ void init_bloom_filters(void) init_bloom_filter_slab(&bloom_filters); } +static int pathmap_cmp(const void *hashmap_cmp_fn_data, + const struct hashmap_entry *eptr, + const struct hashmap_entry *entry_or_key, + const void *keydata) +{ + const struct pathmap_hash_entry *e1, *e2; + + e1 = container_of(eptr, const struct pathmap_hash_entry, entry); + e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry); + + return strcmp(e1->path, e2->path); +} + struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c, int compute_if_not_present) @@ -206,25 +219,29 @@ struct bloom_filter *get_bloom_filter(struct repository *r, struct hashmap pathmap; struct pathmap_hash_entry *e; struct hashmap_iter iter; - hashmap_init(&pathmap, NULL, NULL, 0); + hashmap_init(&pathmap, pathmap_cmp, NULL, 0); for (i = 0; i < diff_queued_diff.nr; i++) { const char *path = diff_queued_diff.queue[i]->two->path; /* - * Add each leading directory of the changed file, i.e. for - * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so - * the Bloom filter could be used to speed up commands like - * 'git log dir/subdir', too. - * - * Note that directories are added without the trailing '/'. - */ + * Add each leading directory of the changed file, i.e. for + * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so + * the Bloom filter could be used to speed up commands like + * 'git log dir/subdir', too. + * + * Note that directories are added without the trailing '/'. + */ do { char *last_slash = strrchr(path, '/'); FLEX_ALLOC_STR(e, path, path); hashmap_entry_init(&e->entry, strhash(path)); - hashmap_add(&pathmap, &e->entry); + + if (!hashmap_get(&pathmap, &e->entry, NULL)) + hashmap_add(&pathmap, &e->entry); + else + free(e); if (!last_slash) last_slash = (char*)path; -- cgit v1.2.3 From 2f6775f00c4b39b905e84c46b95e5a04017ec9ba Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 11 May 2020 11:56:13 +0000 Subject: bloom: use num_changes not nr for limit detection As diff_tree_oid() computes a diff, it will terminate early if the total number of changed paths is strictly larger than max_changes. This includes the directories that changed, not just the file paths. However, only the file paths are reflected in the resulting diff queue's "nr" value. Use the "num_changes" from diffopt to check if the diff terminated early. This is incredibly important, as it can result in incorrect filters! For example, the first commit in the Linux kernel repo reports only 471 changes, but since these are nested inside several directories they expand to 513 "real" changes, and in fact the total list of changes is not reported. Thus, the computed filter for this commit is incorrect. Demonstrate the subtle difference by using one fewer file change in the 'get bloom filter for commit with 513 changes' test. Before, this edited 513 files inside "bigDir" which hit this inequality. However, dropping the file count by one demonstrates how the previous inequality was incorrect but the new one is correct. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'bloom.c') diff --git a/bloom.c b/bloom.c index 196cda0a1b..e2ede44126 100644 --- a/bloom.c +++ b/bloom.c @@ -215,7 +215,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, diff_tree_oid(NULL, &c->object.oid, "", &diffopt); diffcore_std(&diffopt); - if (diff_queued_diff.nr <= max_changes) { + if (diffopt.num_changes <= max_changes) { struct hashmap pathmap; struct pathmap_hash_entry *e; struct hashmap_iter iter; -- cgit v1.2.3