Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/libgit2.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRussell Belfer <rb@github.com>2013-02-15 05:25:10 +0400
committerRussell Belfer <rb@github.com>2013-02-21 03:09:40 +0400
commit5e5848eb15cc0dd8476d1c6882a9f770e6556586 (patch)
tree953fd30d6360b67c2174b6c03fd2984561c84cf6 /tests-clar/core
parent99ba8f2322eaa2df51ace9782b8eadc8c5a6e8b8 (diff)
Change similarity metric to sampled hashes
This moves the similarity metric code out of buf_text and into a new file. Also, this implements a different approach to similarity measurement based on a Rabin-Karp rolling hash where we only keep the top 100 and bottom 100 hashes. In theory, that should be sufficient samples to given a fairly accurate measurement while limiting the amount of data we keep for file signatures no matter how large the file is.
Diffstat (limited to 'tests-clar/core')
-rw-r--r--tests-clar/core/buffer.c114
1 files changed, 60 insertions, 54 deletions
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 63753bb67..6cad05c00 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -1,6 +1,7 @@
#include "clar_libgit2.h"
#include "buffer.h"
#include "buf_text.h"
+#include "hashsig.h"
#include "fileops.h"
#define TESTSTR "Have you seen that? Have you seeeen that??"
@@ -732,89 +733,94 @@ void test_core_buffer__classify_with_utf8(void)
cl_assert(git_buf_text_contains_nul(&b));
}
+#define SIMILARITY_TEST_DATA_1 \
+ "test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" \
+ "is this enough?\nthere has to be enough data to fill the hash array!\n" \
+ "Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n" \
+ "Let's make sure we've got plenty to go with here.\n smile \n"
+
void test_core_buffer__similarity_metric(void)
{
- git_buf_text_hashsig *a, *b;
+ git_hashsig *a, *b;
git_buf buf = GIT_BUF_INIT;
int sim;
/* in the first case, we compare data to itself and expect 100% match */
- cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada"));
- cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
- cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+ cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+ cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
- cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+ cl_assert_equal_i(100, git_hashsig_compare(a, b));
- git_buf_text_hashsig_free(a);
- git_buf_text_hashsig_free(b);
+ git_hashsig_free(a);
+ git_hashsig_free(b);
- /* in the second case, half of a is matched and all of b is matched, so
- * we'll expect a score of around 66% to be the similarity score
- */
+ /* if we change just a single byte, how much does that change magnify? */
- cl_git_pass(
- git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
- cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+ cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+ cl_git_pass(git_buf_sets(&buf,
+ "Test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+ "is this enough?\nthere has to be enough data to fill the hash array!\n"
+ "Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n"
+ "Let's make sure we've got plenty to go with here.\n smile \n"));
+ cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
- cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh"));
- cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+ sim = git_hashsig_compare(a, b);
- sim = git_buf_text_hashsig_compare(a, b, 100);
- cl_assert(sim > 60 && sim < 70);
+ cl_assert(95 < sim && sim < 100); /* expect >95% similarity */
- git_buf_text_hashsig_free(a);
- git_buf_text_hashsig_free(b);
+ git_hashsig_free(a);
+ git_hashsig_free(b);
- /* in the reversed case, 100% of line hashes match, but no pairwise hashes
- * match, so we'll expect about a 50% match for a reversed file
- */
+ /* let's try comparing data to a superset of itself */
- cl_git_pass(
- git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
- cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
- cl_git_pass(
- git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
- cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+ cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1
+ "and if I add some more, it should still be pretty similar, yes?\n"));
+ cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
- sim = git_buf_text_hashsig_compare(a, b, 100);
- cl_assert(sim > 45 && sim < 55);
+ sim = git_hashsig_compare(a, b);
- git_buf_text_hashsig_free(a);
- git_buf_text_hashsig_free(b);
+ cl_assert(70 < sim && sim < 80); /* expect in the 70-80% similarity range */
- /* if we don't use pairwise signatures, then a reversed file should
- * match 100%
- */
+ git_hashsig_free(a);
+ git_hashsig_free(b);
+
+ /* what if we keep about half the original data and add half new */
+
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+ cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+ cl_git_pass(git_buf_sets(&buf,
+ "test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+ "is this enough?\nthere has to be enough data to fill the hash array!\n"
+ "okay, that's half the original\nwhat else can we add?\nmore data\n"
+ "one more line will complete this\nshort\nlines\ndon't\nmatter\n"));
+ cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
- cl_git_pass(
- git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
- cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false));
- cl_git_pass(
- git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
- cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false));
+ sim = git_hashsig_compare(a, b);
- sim = git_buf_text_hashsig_compare(a, b, 100);
- cl_assert_equal_i(100, sim);
+ cl_assert(40 < sim && sim < 60); /* expect in the 40-60% similarity range */
- git_buf_text_hashsig_free(a);
- git_buf_text_hashsig_free(b);
+ git_hashsig_free(a);
+ git_hashsig_free(b);
/* lastly, let's check that we can hash file content as well */
- cl_git_pass(
- git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
- cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+ cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+ cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
- cl_git_mkfile("scratch/testdata",
- "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n");
- cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true));
+ cl_git_mkfile("scratch/testdata", SIMILARITY_TEST_DATA_1);
+ cl_git_pass(git_hashsig_create_fromfile(
+ &b, "scratch/testdata", GIT_HASHSIG_NORMAL));
- cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+ cl_assert_equal_i(100, git_hashsig_compare(a, b));
- git_buf_text_hashsig_free(a);
- git_buf_text_hashsig_free(b);
+ git_hashsig_free(a);
+ git_hashsig_free(b);
git_buf_free(&buf);
git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES);