From eb591e42fd48dda59c309beba6991e95d552034f Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Fri, 1 May 2020 15:30:18 +0000
Subject: bloom: fix whitespace around tab length

Fix alignment issues that were likely introduced due to an editor
using tab lengths of 4 instead of 8.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'bloom.c')

diff --git a/bloom.c b/bloom.c
index dd9bab9bbd..2e3e0f5037 100644
--- a/bloom.c
+++ b/bloom.c
@@ -29,8 +29,8 @@ static inline unsigned char get_bitmask(uint32_t pos)
 }
 
 static int load_bloom_filter_from_graph(struct commit_graph *g,
-				   struct bloom_filter *filter,
-				   struct commit *c)
+					struct bloom_filter *filter,
+					struct commit *c)
 {
 	uint32_t lex_pos, start_index, end_index;
 
@@ -123,9 +123,9 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
 }
 
 void fill_bloom_key(const char *data,
-					size_t len,
-					struct bloom_key *key,
-					const struct bloom_filter_settings *settings)
+		    size_t len,
+		    struct bloom_key *key,
+		    const struct bloom_filter_settings *settings)
 {
 	int i;
 	const uint32_t seed0 = 0x293ae76f;
@@ -139,8 +139,8 @@ void fill_bloom_key(const char *data,
 }
 
 void add_key_to_filter(const struct bloom_key *key,
-					   struct bloom_filter *filter,
-					   const struct bloom_filter_settings *settings)
+		       struct bloom_filter *filter,
+		       const struct bloom_filter_settings *settings)
 {
 	int i;
 	uint64_t mod = filter->len * BITS_PER_WORD;
@@ -160,7 +160,7 @@ void init_bloom_filters(void)
 
 struct bloom_filter *get_bloom_filter(struct repository *r,
 				      struct commit *c,
-					  int compute_if_not_present)
+				      int compute_if_not_present)
 {
 	struct bloom_filter *filter;
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
-- 
cgit v1.2.3


From 891c17c95497712126700cdf5cd887bdb0ff3d55 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 11 May 2020 11:56:10 +0000
Subject: bloom: parse commit before computing filters

When computing changed-path Bloom filters for a commit, we need to
know if the commit has a parent or not. If the commit is not parsed,
then its parent pointer will be NULL.

As far as I can tell, the only opportunity to reach this code
without parsing the commit is inside "test-tool bloom
get_filter_for_commit" but it is best to be safe.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'bloom.c')

diff --git a/bloom.c b/bloom.c
index 2e3e0f5037..9679278271 100644
--- a/bloom.c
+++ b/bloom.c
@@ -193,6 +193,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
+	/* ensure commit is parsed so we have parent information */
+	repo_parse_commit(r, c);
+
 	if (c->parents)
 		diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
 	else
-- 
cgit v1.2.3


From 65c1a28bb60d1c17a672306c651bb402378f81b0 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 11 May 2020 11:56:12 +0000
Subject: bloom: de-duplicate directory entries

When computing a changed-path Bloom filter, we need to take the
files that changed from the diff computation and extract the parent
directories. That way, a directory pathspec such as "Documentation"
could match commits that change "Documentation/git.txt".

However, the current code does a poor job of this process. The paths
are added to a hashmap, but we do not check if an entry already
exists with that path. This can create many duplicate entries and
cause the filter to have a much larger length than it should. This
means that the filter is more sparse than intended, which helps the
false positive rate, but wastes a lot of space.

Properly use hashmap_get() before hashmap_add(). Also be sure to
include a comparison function so these can be matched correctly.

This has an effect on a test in t0095-bloom.sh. This makes sense,
there are ten changes inside "smallDir" so the total number of
paths in the filter should be 11. This would result in 11 * 10 bits
required, and with 8 bits per byte, this results in 14 bytes.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'bloom.c')

diff --git a/bloom.c b/bloom.c
index 9679278271..196cda0a1b 100644
--- a/bloom.c
+++ b/bloom.c
@@ -158,6 +158,19 @@ void init_bloom_filters(void)
 	init_bloom_filter_slab(&bloom_filters);
 }
 
+static int pathmap_cmp(const void *hashmap_cmp_fn_data,
+		       const struct hashmap_entry *eptr,
+		       const struct hashmap_entry *entry_or_key,
+		       const void *keydata)
+{
+	const struct pathmap_hash_entry *e1, *e2;
+
+	e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
+	e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
+
+	return strcmp(e1->path, e2->path);
+}
+
 struct bloom_filter *get_bloom_filter(struct repository *r,
 				      struct commit *c,
 				      int compute_if_not_present)
@@ -206,25 +219,29 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
-		hashmap_init(&pathmap, NULL, NULL, 0);
+		hashmap_init(&pathmap, pathmap_cmp, NULL, 0);
 
 		for (i = 0; i < diff_queued_diff.nr; i++) {
 			const char *path = diff_queued_diff.queue[i]->two->path;
 
 			/*
-			* Add each leading directory of the changed file, i.e. for
-			* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
-			* the Bloom filter could be used to speed up commands like
-			* 'git log dir/subdir', too.
-			*
-			* Note that directories are added without the trailing '/'.
-			*/
+			 * Add each leading directory of the changed file, i.e. for
+			 * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
+			 * the Bloom filter could be used to speed up commands like
+			 * 'git log dir/subdir', too.
+			 *
+			 * Note that directories are added without the trailing '/'.
+			 */
 			do {
 				char *last_slash = strrchr(path, '/');
 
 				FLEX_ALLOC_STR(e, path, path);
 				hashmap_entry_init(&e->entry, strhash(path));
-				hashmap_add(&pathmap, &e->entry);
+
+				if (!hashmap_get(&pathmap, &e->entry, NULL))
+					hashmap_add(&pathmap, &e->entry);
+				else
+					free(e);
 
 				if (!last_slash)
 					last_slash = (char*)path;
-- 
cgit v1.2.3


From 2f6775f00c4b39b905e84c46b95e5a04017ec9ba Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 11 May 2020 11:56:13 +0000
Subject: bloom: use num_changes not nr for limit detection

As diff_tree_oid() computes a diff, it will terminate early if the
total number of changed paths is strictly larger than max_changes.
This includes the directories that changed, not just the file paths.
However, only the file paths are reflected in the resulting diff
queue's "nr" value.

Use the "num_changes" from diffopt to check if the diff terminated
early. This is incredibly important, as it can result in incorrect
filters! For example, the first commit in the Linux kernel repo
reports only 471 changes, but since these are nested inside several
directories they expand to 513 "real" changes, and in fact the
total list of changes is not reported. Thus, the computed filter
for this commit is incorrect.

Demonstrate the subtle difference by using one fewer file change
in the 'get bloom filter for commit with 513 changes' test. Before,
this edited 513 files inside "bigDir" which hit this inequality.
However, dropping the file count by one demonstrates how the
previous inequality was incorrect but the new one is correct.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'bloom.c')

diff --git a/bloom.c b/bloom.c
index 196cda0a1b..e2ede44126 100644
--- a/bloom.c
+++ b/bloom.c
@@ -215,7 +215,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diff_queued_diff.nr <= max_changes) {
+	if (diffopt.num_changes <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
-- 
cgit v1.2.3