Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/libgit2.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Russell Belfer <rb@github.com> 2013-02-22 00:40:33 +0400
committer: Russell Belfer <rb@github.com> 2013-02-22 00:40:33 +0400
commit: 960a04dd56d89e94b5092be19ba9704b2d292dba (patch)
tree: f85c54d36d40f46b3ed31fecfa284f197973c6c3 /src/diff_tform.c
parent: 71a3d27ea686845811f04314d02798b4f1745046 (diff)
Initial integration of similarity metric to diff
This is the initial integration of the similarity metric into the `git_diff_find_similar()` code path. The existing tests all pass, but the new functionality isn't currently well tested. The integration does go through the pluggable metric interface, so it should be possible to drop in an alternative to the internal metric that libgit2 implements.

This comes along with a behavior change for an existing interface; namely, passing two NULLs to git_diff_blobs (or passing NULLs to git_diff_blob_to_buffer) will now call the file_cb parameter zero times instead of one time. I know it's strange that that change is paired with this other change, but it emerged from some initialization changes that I ended up making.
Diffstat (limited to 'src/diff_tform.c')
-rw-r--r-- src/diff_tform.c 235
1 files changed, 178 insertions, 57 deletions
diff --git a/src/diff_tform.c b/src/diff_tform.c
index e051732c5..48332d3e5 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -7,6 +7,7 @@
#include "common.h"
#include "diff.h"
#include "git2/config.h"
+#include "git2/blob.h"
#include "hashsig.h"
static git_diff_delta *diff_delta__dup(
@@ -362,24 +363,86 @@ on_error:
return -1;
}
-typedef struct {
- /* array of delta index * 2 + (old_file/new_file) -> file hashes */
- git_hashsig *sigs;
-} diff_similarity_cache;
+GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx)
+{
+ git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
+ return (idx & 1) ? &delta->new_file : &delta->old_file;
+}
-static unsigned int calc_similarity(
- void *ref, git_diff_file *old_file, git_diff_file *new_file)
+static int similarity_calc(
+ git_diff_list *diff,
+ git_diff_find_options *opts,
+ size_t file_idx,
+ void **cache)
{
- diff_similarity_cache *cache = ref;
+ int error = 0;
+ git_diff_file *file = similarity_get_file(diff, file_idx);
+ git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src;
+
+ if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */
+ git_buf path = GIT_BUF_INIT;
+
+ /* TODO: apply wd-to-odb filters to file data if necessary */
+
+ if (!(error = git_buf_joinpath(
+ &path, git_repository_workdir(diff->repo), file->path)))
+ error = opts->metric->file_signature(
+ &cache[file_idx], file, path.ptr, opts->metric->payload);
+
+ git_buf_free(&path);
+ } else { /* compute hashsig from blob buffer */
+ git_blob *blob = NULL;
- GIT_UNUSED(cache);
+ /* TODO: add max size threshold a la diff? */
- if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0)
+ if ((error = git_blob_lookup(&blob, diff->repo, &file->oid)) < 0)
+ return error;
+
+ error = opts->metric->buffer_signature(
+ &cache[file_idx], file, git_blob_rawcontent(blob),
+ git_blob_rawsize(blob), opts->metric->payload);
+
+ git_blob_free(blob);
+ }
+
+ return error;
+}
+
+static int similarity_measure(
+ git_diff_list *diff,
+ git_diff_find_options *opts,
+ void **cache,
+ size_t a_idx,
+ size_t b_idx)
+{
+ int score = 0;
+ git_diff_file *a_file = similarity_get_file(diff, a_idx);
+ git_diff_file *b_file = similarity_get_file(diff, b_idx);
+
+ if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
+ return 0;
+
+ if (git_oid_cmp(&a_file->oid, &b_file->oid) == 0)
return 100;
- /* TODO: insert actual similarity algo here */
+ /* update signature cache if needed */
+ if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
+ return -1;
+ if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0)
+ return -1;
- return 0;
+ /* compare signatures */
+ if (opts->metric->similarity(
+ &score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
+ return -1;
+
+ /* clip score */
+ if (score < 0)
+ score = 0;
+ else if (score > 100)
+ score = 100;
+
+ return score;
}
#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
@@ -388,67 +451,85 @@ int git_diff_find_similar(
git_diff_list *diff,
git_diff_find_options *given_opts)
{
- unsigned int i, j, similarity;
+ size_t i, j, cache_size, *matches;
+ int error = 0, similarity;
git_diff_delta *from, *to;
git_diff_find_options opts;
- unsigned int tried_targets, num_changes = 0;
- git_vector matches = GIT_VECTOR_INIT;
+ size_t tried_targets, num_rewrites = 0;
+ void **cache;
- if (normalize_find_opts(diff, &opts, given_opts) < 0)
- return -1;
+ if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
+ return error;
- /* first do splits if requested */
+ /* TODO: maybe abort if deltas.length > target_limit ??? */
+
+ cache_size = diff->deltas.length * 2; /* must store b/c length may change */
+ cache = git__calloc(cache_size, sizeof(void *));
+ GITERR_CHECK_ALLOC(cache);
+
+ matches = git__calloc(diff->deltas.length, sizeof(size_t));
+ GITERR_CHECK_ALLOC(matches);
+
+ /* first break MODIFIED records that are too different (if requested) */
if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) {
git_vector_foreach(&diff->deltas, i, from) {
if (from->status != GIT_DELTA_MODIFIED)
continue;
- /* Right now, this doesn't work right because the similarity
- * algorithm isn't actually implemented...
- */
- similarity = 100;
- /* calc_similarity(NULL, &from->old_file, from->new_file); */
+ similarity = similarity_measure(
+ diff, &opts, cache, 2 * i, 2 * i + 1);
- if (similarity < opts.break_rewrite_threshold) {
+ if (similarity < 0) {
+ error = similarity;
+ goto cleanup;
+ }
+
+ if ((unsigned int)similarity < opts.break_rewrite_threshold) {
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
- num_changes++;
+ num_rewrites++;
}
}
-
- /* apply splits as needed */
- if (num_changes > 0 &&
- apply_splits_and_deletes(
- diff, diff->deltas.length + num_changes) < 0)
- return -1;
}
/* next find the most similar delta for each rename / copy candidate */
- if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0)
- return -1;
-
git_vector_foreach(&diff->deltas, i, from) {
tried_targets = 0;
+ /* skip things that aren't blobs */
+ if (GIT_MODE_TYPE(from->old_file.mode) !=
+ GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ continue;
+
git_vector_foreach(&diff->deltas, j, to) {
if (i == j)
continue;
+ /* skip things that aren't blobs */
+ if (GIT_MODE_TYPE(to->new_file.mode) !=
+ GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ continue;
+
switch (to->status) {
case GIT_DELTA_ADDED:
case GIT_DELTA_UNTRACKED:
case GIT_DELTA_RENAMED:
case GIT_DELTA_COPIED:
break;
+ case GIT_DELTA_MODIFIED:
+ if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
+ continue;
+ break;
default:
/* only the above status values should be checked */
continue;
}
/* skip all but DELETED files unless copy detection is on */
- if (from->status != GIT_DELTA_DELETED &&
- !FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
+ if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) &&
+ from->status != GIT_DELTA_DELETED &&
+ (from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
continue;
/* don't check UNMODIFIED files as source unless given option */
@@ -463,34 +544,44 @@ int git_diff_find_similar(
/* calculate similarity and see if this pair beats the
* similarity score of the current best pair.
*/
- similarity = calc_similarity(NULL, &from->old_file, &to->new_file);
+ similarity = similarity_measure(
+ diff, &opts, cache, 2 * i, 2 * j + 1);
+
+ if (similarity < 0) {
+ error = similarity;
+ goto cleanup;
+ }
- if (to->similarity < similarity) {
- to->similarity = similarity;
- if (git_vector_set(NULL, &matches, j, from) < 0)
- return -1;
+ if (to->similarity < (unsigned int)similarity) {
+ to->similarity = (unsigned int)similarity;
+ matches[j] = i + 1;
}
}
}
/* next rewrite the diffs with renames / copies */
- num_changes = 0;
+ num_rewrites = 0;
git_vector_foreach(&diff->deltas, j, to) {
- from = GIT_VECTOR_GET(&matches, j);
- if (!from) {
+ if (!matches[j]) {
assert(to->similarity == 0);
continue;
}
- /* three possible outcomes here:
+ i = matches[j] - 1;
+ from = GIT_VECTOR_GET(&diff->deltas, i);
+ assert(from);
+
+ /* four possible outcomes here:
* 1. old DELETED and if over rename threshold,
* new becomes RENAMED and old goes away
- * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
+ * 2. old SPLIT and if over rename threshold,
+ * new becomes RENAMED and old becomes ADDED (clear SPLIT)
+ * 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
* old is more similar to new than it is to itself, in which
* case, new becomes RENAMED and old becomed ADDED
- * 3. otherwise if over copy threshold, new becomes COPIED
+ * 4. otherwise if over copy threshold, new becomes COPIED
*/
if (from->status == GIT_DELTA_DELETED) {
@@ -503,7 +594,26 @@ int git_diff_find_similar(
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
from->flags |= GIT_DIFF_FLAG__TO_DELETE;
- num_changes++;
+ num_rewrites++;
+
+ continue;
+ }
+
+ if (from->status == GIT_DELTA_MODIFIED &&
+ (from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
+ {
+ if (to->similarity < opts.rename_threshold) {
+ to->similarity = 0;
+ continue;
+ }
+
+ to->status = GIT_DELTA_RENAMED;
+ memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+
+ from->status = GIT_DELTA_ADDED;
+ from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
+ memset(&from->old_file, 0, sizeof(from->old_file));
+ num_rewrites--;
continue;
}
@@ -512,10 +622,15 @@ int git_diff_find_similar(
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
to->similarity > opts.rename_threshold)
{
- similarity = 100;
- /* calc_similarity(NULL, &from->old_file, from->new_file); */
+ similarity = similarity_measure(
+ diff, &opts, cache, 2 * i, 2 * i + 1);
+
+ if (similarity < 0) {
+ error = similarity;
+ goto cleanup;
+ }
- if (similarity < opts.rename_from_rewrite_threshold) {
+ if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) {
to->status = GIT_DELTA_RENAMED;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
@@ -538,17 +653,23 @@ int git_diff_find_similar(
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
}
- git_vector_free(&matches);
+ if (num_rewrites > 0) {
+ assert(num_rewrites < diff->deltas.length);
- if (num_changes > 0) {
- assert(num_changes < diff->deltas.length);
+ error = apply_splits_and_deletes(
+ diff, diff->deltas.length - num_rewrites);
+ }
+
+cleanup:
+ git__free(matches);
- if (apply_splits_and_deletes(
- diff, diff->deltas.length - num_changes) < 0)
- return -1;
+ for (i = 0; i < cache_size; ++i) {
+ if (cache[i] != NULL)
+ opts.metric->free_signature(cache[i], opts.metric->payload);
}
+ git__free(cache);
- return 0;
+ return error;
}
#undef FLAG_SET