Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/libgit2.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Russell Belfer <rb@github.com> 2013-05-21 00:37:21 +0400
committer: Russell Belfer <rb@github.com> 2013-05-21 00:37:21 +0400
commit: 9be5be47fb1d9bc08e25b30c05dbf48739710062 (patch)
tree: 7543170535a1260f24c27696c361e58ae19de17c
parent: 5c8f37a397609eba92c6a906be467a7d7373e4f3 (diff)
More git_diff_find_similar improvements
- Add new GIT_DIFF_FIND_EXACT_MATCH_ONLY flag to do similarity matching without using the similarity metric (i.e. only compare the SHA).
- Clean up the similarity measurement code to more rigorously distinguish between files that are not similar and files that are not comparable (previously, a 0 could either mean that the files could not be compared or that they were totally different).
- When splitting a MODIFIED file into a DELETE/ADD pair, actually make a DELETED/UNTRACKED pair if the right side of the diff is from the working directory. This prevents an odd mix of ADDED and UNTRACKED files on workdir diffs.
-rw-r--r--  include/git2/diff.h  2
-rw-r--r--  src/diff_tform.c  169
-rw-r--r--  src/fileops.h  1
3 files changed, 101 insertions, 71 deletions
diff --git a/include/git2/diff.h b/include/git2/diff.h
index 172aa118b..31f6e0591 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -441,6 +441,8 @@ typedef enum {
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
/** measure similarity including all data */
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
+ /** measure similarity only by comparing SHAs (fast and cheap) */
+ GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1 << 14),
} git_diff_find_t;
/**
diff --git a/src/diff_tform.c b/src/diff_tform.c
index 33268e403..d5e56ac60 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -255,6 +255,16 @@ static int normalize_find_opts(
/* some flags imply others */
+ if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
+ /* if we are only looking for exact matches, then don't turn
+ * MODIFIED items into ADD/DELETE pairs because it's too picky
+ */
+ opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
+
+ /* similarly, don't look for self-rewrites to split */
+ opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
+ }
+
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
opts->flags |= GIT_DIFF_FIND_RENAMES;
@@ -373,7 +383,10 @@ static int apply_splits_and_deletes(
if (git_vector_insert(&onto, deleted) < 0)
goto on_error;
- delta->status = GIT_DELTA_ADDED;
+ if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+ delta->status = GIT_DELTA_UNTRACKED;
+ else
+ delta->status = GIT_DELTA_ADDED;
memset(&delta->old_file, 0, sizeof(delta->old_file));
delta->old_file.path = delta->new_file.path;
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
@@ -460,22 +473,56 @@ static int similarity_calc(
return error;
}
+#define FLAG_SET(opts,flag_name) (((opts).flags & flag_name) != 0)
+
+/* - score < 0 means files cannot be compared
+ * - score >= 100 means files are exact match
+ * - score == 0 means files are completely different
+ */
static int similarity_measure(
+ int *score,
git_diff_list *diff,
git_diff_find_options *opts,
void **cache,
size_t a_idx,
size_t b_idx)
{
- int score = 0;
git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx);
+ bool exact_match = FLAG_SET(*opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
+
+ *score = -1;
+ /* don't try to compare files of different types */
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
return 0;
- if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0)
- return 100;
+ /* if exact match is requested, force calculation of missing OIDs */
+ if (exact_match) {
+ if (git_oid_iszero(&a_file->oid) &&
+ diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
+ !git_diff__oid_for_file(diff->repo, a_file->path,
+ a_file->mode, a_file->size, &a_file->oid))
+ a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+
+ if (git_oid_iszero(&b_file->oid) &&
+ diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
+ !git_diff__oid_for_file(diff->repo, b_file->path,
+ b_file->mode, b_file->size, &b_file->oid))
+ b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
+ }
+
+ /* check OID match as a quick test */
+ if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
+ *score = 100;
+ return 0;
+ }
+
+ /* don't calculate signatures if we are doing exact match */
+ if (exact_match) {
+ *score = 0;
+ return 0;
+ }
/* update signature cache if needed */
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
@@ -488,20 +535,33 @@ static int similarity_measure(
return 0;
/* compare signatures */
- if (opts->metric->similarity(
- &score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
- return -1;
-
- /* clip score */
- if (score < 1)
- score = 1; /* zero means uncomparable, so use 1 for least similar */
- else if (score > 100)
- score = 100;
-
- return score;
+ return opts->metric->similarity(
+ score, cache[a_idx], cache[b_idx], opts->metric->payload);
}
-#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
+static void convert_to_rename_and_add(
+ git_diff_list *diff,
+ git_diff_delta *from,
+ git_diff_delta *to,
+ int similarity)
+{
+ to->status = GIT_DELTA_RENAMED;
+ to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+ to->similarity = (uint32_t)similarity;
+ memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+ validate_delta(to);
+
+ if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
+ from->status = GIT_DELTA_UNTRACKED;
+ else
+ from->status = GIT_DELTA_ADDED;
+ from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+ from->similarity = 0;
+ memset(&from->old_file, 0, sizeof(from->old_file));
+ from->old_file.path = from->new_file.path;
+ from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+ validate_delta(from);
+}
typedef struct {
uint32_t idx;
@@ -542,21 +602,17 @@ int git_diff_find_similar(
continue;
/* skip things that aren't plain blobs */
- if (GIT_MODE_TYPE(from->old_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(from->old_file.mode))
continue;
/* measure similarity from old_file to new_file */
- similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * i + 1);
-
- if (similarity < 0) {
- error = similarity;
+ if ((error = similarity_measure(
+ &similarity, diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
goto cleanup;
- }
- if (similarity > 0 &&
- similarity < (int)opts.break_rewrite_threshold) {
+ if (similarity < 0)
+ continue;
+ if (similarity < (int)opts.break_rewrite_threshold) {
from->similarity = (uint32_t)similarity;
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
num_rewrites++;
@@ -573,8 +629,7 @@ int git_diff_find_similar(
matches[i].similarity = 0;
/* skip things that aren't plain blobs */
- if (GIT_MODE_TYPE(from->old_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(from->old_file.mode))
continue;
/* don't check UNMODIFIED files as source unless given option */
@@ -599,8 +654,7 @@ int git_diff_find_similar(
continue;
/* skip things that aren't blobs */
- if (GIT_MODE_TYPE(to->new_file.mode) !=
- GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+ if (!GIT_MODE_ISBLOB(to->new_file.mode))
continue;
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
@@ -630,14 +684,13 @@ int git_diff_find_similar(
break;
/* calculate similarity for this pair and find best match */
- similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * j + 1);
-
- if (similarity < 0) {
- error = similarity;
+ if ((error = similarity_measure(
+ &similarity, diff, &opts, cache, 2 * i, 2 * j + 1)) < 0)
goto cleanup;
+ if (similarity < 0) {
+ --tried_targets;
+ continue;
}
-
if (matches[i].similarity < (uint32_t)similarity) {
matches[i].similarity = (uint32_t)similarity;
matches[i].idx = j;
@@ -687,18 +740,7 @@ int git_diff_find_similar(
if (similarity < (int)opts.rename_threshold)
continue;
- to->status = GIT_DELTA_RENAMED;
- to->similarity = (uint32_t)similarity;
- memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
- validate_delta(to);
-
- from->status = GIT_DELTA_ADDED;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
- from->similarity = 0; /* reset self-similarity */
- memset(&from->old_file, 0, sizeof(from->old_file));
- from->old_file.path = from->new_file.path;
- validate_delta(from);
-
+ convert_to_rename_and_add(diff, from, to, similarity);
num_rewrites--;
num_updates++;
continue;
@@ -712,28 +754,16 @@ int git_diff_find_similar(
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
similarity > (int)opts.rename_threshold)
{
- int self_similarity = similarity_measure(
- diff, &opts, cache, 2 * i, 2 * i + 1);
- if (self_similarity < 0) {
- error = self_similarity;
+ int self_similarity;
+
+ if ((error = similarity_measure(&self_similarity,
+ diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
goto cleanup;
- }
- if (self_similarity < (int)opts.rename_from_rewrite_threshold) {
- to->status = GIT_DELTA_RENAMED;
- to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
- to->similarity = (uint32_t)similarity;
- memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
- validate_delta(to);
-
- from->status = GIT_DELTA_ADDED;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
- from->similarity = 0;
- memset(&from->old_file, 0, sizeof(from->old_file));
- from->old_file.path = from->new_file.path;
- from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
- validate_delta(from);
+ if (self_similarity >= 0 &&
+ self_similarity < (int)opts.rename_from_rewrite_threshold) {
+ convert_to_rename_and_add(diff, from, to, similarity);
num_updates++;
continue;
}
@@ -754,13 +784,10 @@ int git_diff_find_similar(
num_updates++;
}
- if (num_rewrites > 0) {
- assert(num_rewrites < diff->deltas.length);
-
+ if (num_rewrites > 0)
error = apply_splits_and_deletes(
diff, diff->deltas.length - num_rewrites,
FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
- }
if (num_rewrites > 0 || num_updates > 0)
git_vector_sort(&diff->deltas);
diff --git a/src/fileops.h b/src/fileops.h
index 627a6923d..3e214aab1 100644
--- a/src/fileops.h
+++ b/src/fileops.h
@@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd);
#define GIT_MODE_PERMS_MASK 0777
#define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644)
#define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK)
+#define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
/**
* Convert a mode_t from the OS to a legal git mode_t value.