diff options
author | Russell Belfer <rb@github.com> | 2013-01-12 10:13:02 +0400 |
---|---|---|
committer | Russell Belfer <rb@github.com> | 2013-02-21 03:09:40 +0400 |
commit | 9c454b007b57669e7baf2b8b69cf053f32a620a2 (patch) | |
tree | 50e4e8a799f62469667d9140c043421cd3e1d73a /tests-clar/core | |
parent | f2e1d06064696a07c917dbcce0dd04c1cbfe32e9 (diff) |
Initial implementation of similarity scoring algo
This adds a new `git_buf_text_hashsig` type and functions to
generate these hash signatures and compare them to give a
similarity score. This can be plugged into diff similarity
scoring.
Diffstat (limited to 'tests-clar/core')
-rw-r--r-- | tests-clar/core/buffer.c | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 49ab41f71..63753bb67 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -1,6 +1,7 @@
 #include "clar_libgit2.h"
 #include "buffer.h"
 #include "buf_text.h"
+#include "fileops.h"
 
 #define TESTSTR "Have you seen that? Have you seeeen that??"
 const char *test_string = TESTSTR;
@@ -730,3 +731,91 @@ void test_core_buffer__classify_with_utf8(void)
 	cl_assert(git_buf_text_is_binary(&b));
 	cl_assert(git_buf_text_contains_nul(&b));
 }
+/* Scores git_buf_text_hashsig pairs for identical, prefix, reversed and on-disk inputs. */
+void test_core_buffer__similarity_metric(void)
+{
+	git_buf_text_hashsig *a, *b; /* the two signatures compared in each case */
+	git_buf buf = GIT_BUF_INIT; /* input text, overwritten by git_buf_sets() per case */
+	int sim; /* compare() score; the 100 passed to it looks like the scale - TODO confirm */
+
+	/* case 1: both signatures come from the same buffer, so the score must be 100 */
+
+	cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada"));
+	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+
+	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+
+	git_buf_text_hashsig_free(a);
+	git_buf_text_hashsig_free(b);
+
+	/* case 2: b is the first 8 of a's 16 lines (and lacks the final newline), so
+	 * about half of a and all of b match; expected near 66, asserted as 61..69
+	 */
+
+	cl_git_pass(
+		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+
+	cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh"));
+	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+
+	sim = git_buf_text_hashsig_compare(a, b, 100);
+	cl_assert(sim > 60 && sim < 70);
+
+	git_buf_text_hashsig_free(a);
+	git_buf_text_hashsig_free(b);
+
+	/* case 3: b is a with its lines in reverse order; per the author, every line
+	 * hash matches but no pairwise hash does, so the score must fall in 46..54
+	 */
+
+	cl_git_pass(
+		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(
+		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+
+	sim = git_buf_text_hashsig_compare(a, b, 100);
+	cl_assert(sim > 45 && sim < 55);
+
+	git_buf_text_hashsig_free(a);
+	git_buf_text_hashsig_free(b);
+
+	/* case 4: same reversed inputs, but created with `false` instead of `true`
+	 * (no pairwise signatures, per the author); the score must be exactly 100
+	 */
+
+	cl_git_pass(
+		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false));
+	cl_git_pass(
+		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false));
+
+	sim = git_buf_text_hashsig_compare(a, b, 100);
+	cl_assert_equal_i(100, sim);
+
+	git_buf_text_hashsig_free(a);
+	git_buf_text_hashsig_free(b);
+
+	/* case 5: a signature read from a file must score 100 against one built from the same text */
+
+	cl_git_pass(
+		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
+	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+
+	cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
+	cl_git_mkfile("scratch/testdata",
+		"a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n");
+	cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true));
+
+	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+
+	git_buf_text_hashsig_free(a);
+	git_buf_text_hashsig_free(b);
+
+	git_buf_free(&buf);
+	git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES); /* result is not checked */
+} |