Use a *real* built-in diff generator

This uses a simplified libxdiff setup to generate unified diffs _without_ doing fork/execve of GNU "diff". This has several huge advantages, for example: Before: [torvalds@g5 linux]$ time git diff v2.6.16.. > /dev/null real 0m24.818s user 0m13.332s sys 0m8.664s After: [torvalds@g5 linux]$ time git diff v2.6.16.. > /dev/null real 0m4.563s user 0m2.944s sys 0m1.580s and the fact that this should be a lot more portable (ie we can ignore all the issues with doing fork/execve under Windows). Perhaps even more importantly, this allows us to do diffs without actually ever writing out the git file contents to a temporary file (and without any of the shell quoting issues on filenames etc etc). NOTE! THIS PATCH DOES NOT DO THAT OPTIMIZATION YET! I was lazy, and the current "diff-core" code actually will always write the temp-files, because it used to be something that you simply had to do. So this current one actually writes a temp-file like before, and then reads it into memory again just to do the diff. Stupid. But if this basic infrastructure is accepted, we can start switching over diff-core to not write temp-files, which should speed things up even further, especially when doing big tree-to-tree diffs. Now, in the interest of full disclosure, I should also point out a few downsides: - the libxdiff algorithm is different, and I bet GNU diff has gotten a lot more testing. And the thing is, generating a diff is not an exact science - you can get two different diffs (and you will), and they can both be perfectly valid. So it's not possible to "validate" the libxdiff output by just comparing it against GNU diff. - GNU diff does some nice eye-candy, like trying to figure out what the last function was, and adding that information to the "@@ .." line. libxdiff doesn't do that. - The libxdiff thing has some known deficiencies. In particular, it gets the "\No newline at end of file" case wrong. So this is currently for the experimental branch only. I hope Davide will help fix it. That said, I think the huge performance advantage, and the fact that it integrates better is definitely worth it. But it should go into a development branch at least due to the missing newline issue. Technical note: this is based on libxdiff-0.17, but I did some surgery to get rid of the extraneous fat - stuff that git doesn't need, and seriously cutting down on mmfile_t, which had much more capabilities than the diff algorithm either needed or used. In this version, "mmfile_t" is just a trivial <pointer,length> tuple. That said, I tried to keep the differences to simple removals, so that you can do a diff between this and the libxdiff origin, and you'll basically see just things getting deleted. Even the mmfile_t simplifications are left in a state where the diffs should be readable. Apologies to Davide, whom I'd love to get feedback on this all from (I wrote my own "fill_mmfile()" for the new simpler mmfile_t format: the old complex format had a helper function for that, but I did my surgery with the goal in mind that eventually we _should_ just do mmfile_t mf; buf = read_sha1_file(sha1, type, &size); mf->ptr = buf; mf->size = size; .. use "mf" directly .. which was really a nightmare with the old "helpful" mmfile_t, and really is that easy with the new cut-down interfaces). [ Btw, as any hawk-eye can see from the diff, this was actually generated with itself, so it is "self-hosting". That's about all the testing it has gotten, along with the above kernel diff, which eye-balls correctly, but shows the newline issue when you double-check it with "git-apply" ] Signed-off-by: Linus Torvalds <torvalds@osdl.org> Signed-off-by: Junio C Hamano <junkio@cox.net>
author: Linus Torvalds <torvalds@osdl.org> 2006-03-25 07:13:22 +0300
committer: Junio C Hamano <junkio@cox.net> 2006-03-26 04:49:58 +0400
commit: 3443546f6ef57fe28ea5cca232df8e400bfc3883 (patch)
tree: 01b252a45c73d900a97fe305d696f386658e5d7d /xdiff/xdiffi.c
parent: c150462824008957f568ca7aa05a65b35d860eb9 (diff)
1 files changed, 469 insertions, 0 deletions
diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c
new file mode 100644
index 0000000000..8ea04837ec
--- /dev/null
+++ b/xdiff/xdiffi.c
@@ -0,0 +1,469 @@
+/*
+ *  LibXDiff by Davide Libenzi ( File Differential Library )
+ *  Copyright (C) 2003	Davide Libenzi
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include "xinclude.h"
+
+
+
+#define XDL_MAX_COST_MIN 256
+#define XDL_HEUR_MIN_COST 256
+#define XDL_LINE_MAX (long)((1UL << (8 * sizeof(long) - 1)) - 1)
+#define XDL_SNAKE_CNT 20
+#define XDL_K_HEUR 4
+
+
+
+typedef struct s_xdpsplit {
+	long i1, i2;
+	int min_lo, min_hi;
+} xdpsplit_t;
+
+
+
+
+static long xdl_split(unsigned long const *ha1, long off1, long lim1,
+		      unsigned long const *ha2, long off2, long lim2,
+		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
+		      xdalgoenv_t *xenv);
+static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
+
+
+
+
+/*
+ * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
+ * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
+ * the forward diagonal starting from (off1, off2) and the backward diagonal
+ * starting from (lim1, lim2). If the K values on the same diagonal crosses
+ * returns the furthest point of reach. We might end up having to expensive
+ * cases using this algorithm is full, so a little bit of heuristic is needed
+ * to cut the search and to return a suboptimal point.
+ */
+static long xdl_split(unsigned long const *ha1, long off1, long lim1,
+		      unsigned long const *ha2, long off2, long lim2,
+		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
+		      xdalgoenv_t *xenv) {
+	long dmin = off1 - lim2, dmax = lim1 - off2;
+	long fmid = off1 - off2, bmid = lim1 - lim2;
+	long odd = (fmid - bmid) & 1;
+	long fmin = fmid, fmax = fmid;
+	long bmin = bmid, bmax = bmid;
+	long ec, d, i1, i2, prev1, best, dd, v, k;
+
+	/*
+	 * Set initial diagonal values for both forward and backward path.
+	 */
+	kvdf[fmid] = off1;
+	kvdb[bmid] = lim1;
+
+	for (ec = 1;; ec++) {
+		int got_snake = 0;
+
+		/*
+		 * We need to extent the diagonal "domain" by one. If the next
+		 * values exits the box boundaries we need to change it in the
+		 * opposite direction because (max - min) must be a power of two.
+		 * Also we initialize the extenal K value to -1 so that we can
+		 * avoid extra conditions check inside the core loop.
+		 */
+		if (fmin > dmin)
+			kvdf[--fmin - 1] = -1;
+		else
+			++fmin;
+		if (fmax < dmax)
+			kvdf[++fmax + 1] = -1;
+		else
+			--fmax;
+
+		for (d = fmax; d >= fmin; d -= 2) {
+			if (kvdf[d - 1] >= kvdf[d + 1])
+				i1 = kvdf[d - 1] + 1;
+			else
+				i1 = kvdf[d + 1];
+			prev1 = i1;
+			i2 = i1 - d;
+			for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
+			if (i1 - prev1 > xenv->snake_cnt)
+				got_snake = 1;
+			kvdf[d] = i1;
+			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
+				spl->i1 = i1;
+				spl->i2 = i2;
+				spl->min_lo = spl->min_hi = 1;
+				return ec;
+			}
+		}
+
+		/*
+		 * We need to extent the diagonal "domain" by one. If the next
+		 * values exits the box boundaries we need to change it in the
+		 * opposite direction because (max - min) must be a power of two.
+		 * Also we initialize the extenal K value to -1 so that we can
+		 * avoid extra conditions check inside the core loop.
+		 */
+		if (bmin > dmin)
+			kvdb[--bmin - 1] = XDL_LINE_MAX;
+		else
+			++bmin;
+		if (bmax < dmax)
+			kvdb[++bmax + 1] = XDL_LINE_MAX;
+		else
+			--bmax;
+
+		for (d = bmax; d >= bmin; d -= 2) {
+			if (kvdb[d - 1] < kvdb[d + 1])
+				i1 = kvdb[d - 1];
+			else
+				i1 = kvdb[d + 1] - 1;
+			prev1 = i1;
+			i2 = i1 - d;
+			for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
+			if (prev1 - i1 > xenv->snake_cnt)
+				got_snake = 1;
+			kvdb[d] = i1;
+			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
+				spl->i1 = i1;
+				spl->i2 = i2;
+				spl->min_lo = spl->min_hi = 1;
+				return ec;
+			}
+		}
+
+		if (need_min)
+			continue;
+
+		/*
+		 * If the edit cost is above the heuristic trigger and if
+		 * we got a good snake, we sample current diagonals to see
+		 * if some of the, have reached an "interesting" path. Our
+		 * measure is a function of the distance from the diagonal
+		 * corner (i1 + i2) penalized with the distance from the
+		 * mid diagonal itself. If this value is above the current
+		 * edit cost times a magic factor (XDL_K_HEUR) we consider
+		 * it interesting.
+		 */
+		if (got_snake && ec > xenv->heur_min) {
+			for (best = 0, d = fmax; d >= fmin; d -= 2) {
+				dd = d > fmid ? d - fmid: fmid - d;
+				i1 = kvdf[d];
+				i2 = i1 - d;
+				v = (i1 - off1) + (i2 - off2) - dd;
+
+				if (v > XDL_K_HEUR * ec && v > best &&
+				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
+				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
+					for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
+						if (k == xenv->snake_cnt) {
+							best = v;
+							spl->i1 = i1;
+							spl->i2 = i2;
+							break;
+						}
+				}
+			}
+			if (best > 0) {
+				spl->min_lo = 1;
+				spl->min_hi = 0;
+				return ec;
+			}
+
+			for (best = 0, d = bmax; d >= bmin; d -= 2) {
+				dd = d > bmid ? d - bmid: bmid - d;
+				i1 = kvdb[d];
+				i2 = i1 - d;
+				v = (lim1 - i1) + (lim2 - i2) - dd;
+
+				if (v > XDL_K_HEUR * ec && v > best &&
+				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
+				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
+					for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
+						if (k == xenv->snake_cnt - 1) {
+							best = v;
+							spl->i1 = i1;
+							spl->i2 = i2;
+							break;
+						}
+				}
+			}
+			if (best > 0) {
+				spl->min_lo = 0;
+				spl->min_hi = 1;
+				return ec;
+			}
+		}
+
+		/*
+		 * Enough is enough. We spent too much time here and now we collect
+		 * the furthest reaching path using the (i1 + i2) measure.
+		 */
+		if (ec >= xenv->mxcost) {
+			long fbest, fbest1, bbest, bbest1;
+
+			fbest = -1;
+			for (d = fmax; d >= fmin; d -= 2) {
+				i1 = XDL_MIN(kvdf[d], lim1);
+				i2 = i1 - d;
+				if (lim2 < i2)
+					i1 = lim2 + d, i2 = lim2;
+				if (fbest < i1 + i2) {
+					fbest = i1 + i2;
+					fbest1 = i1;
+				}
+			}
+
+			bbest = XDL_LINE_MAX;
+			for (d = bmax; d >= bmin; d -= 2) {
+				i1 = XDL_MAX(off1, kvdb[d]);
+				i2 = i1 - d;
+				if (i2 < off2)
+					i1 = off2 + d, i2 = off2;
+				if (i1 + i2 < bbest) {
+					bbest = i1 + i2;
+					bbest1 = i1;
+				}
+			}
+
+			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
+				spl->i1 = fbest1;
+				spl->i2 = fbest - fbest1;
+				spl->min_lo = 1;
+				spl->min_hi = 0;
+			} else {
+				spl->i1 = bbest1;
+				spl->i2 = bbest - bbest1;
+				spl->min_lo = 0;
+				spl->min_hi = 1;
+			}
+			return ec;
+		}
+	}
+
+	return -1;
+}
+
+
+/*
+ * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
+ * the box splitting function. Note that the real job (marking changed lines)
+ * is done in the two boundary reaching checks.
+ */
+int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
+		 diffdata_t *dd2, long off2, long lim2,
+		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
+	unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;
+
+	/*
+	 * Shrink the box by walking through each diagonal snake (SW and NE).
+	 */
+	for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
+	for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);
+
+	/*
+	 * If one dimension is empty, then all records on the other one must
+	 * be obviously changed.
+	 */
+	if (off1 == lim1) {
+		char *rchg2 = dd2->rchg;
+		long *rindex2 = dd2->rindex;
+
+		for (; off2 < lim2; off2++)
+			rchg2[rindex2[off2]] = 1;
+	} else if (off2 == lim2) {
+		char *rchg1 = dd1->rchg;
+		long *rindex1 = dd1->rindex;
+
+		for (; off1 < lim1; off1++)
+			rchg1[rindex1[off1]] = 1;
+	} else {
+		long ec;
+		xdpsplit_t spl;
+
+		/*
+		 * Divide ...
+		 */
+		if ((ec = xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
+				    need_min, &spl, xenv)) < 0) {
+
+			return -1;
+		}
+
+		/*
+		 * ... et Impera.
+		 */
+		if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
+				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
+		    xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
+				 kvdf, kvdb, spl.min_hi, xenv) < 0) {
+
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+
+int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
+		xdfenv_t *xe) {
+	long ndiags;
+	long *kvd, *kvdf, *kvdb;
+	xdalgoenv_t xenv;
+	diffdata_t dd1, dd2;
+
+	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
+
+		return -1;
+	}
+
+	/*
+	 * Allocate and setup K vectors to be used by the differential algorithm.
+	 * One is to store the forward path and one to store the backward path.
+	 */
+	ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
+	if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) {
+
+		xdl_free_env(xe);
+		return -1;
+	}
+	kvdf = kvd;
+	kvdb = kvdf + ndiags;
+	kvdf += xe->xdf2.nreff + 1;
+	kvdb += xe->xdf2.nreff + 1;
+
+	/*
+	 * Classical integer square root approximation using shifts.
+	 */
+	xenv.mxcost = 1;
+	for (; ndiags; ndiags >>= 2)
+		xenv.mxcost <<= 1;
+	if (xenv.mxcost < XDL_MAX_COST_MIN)
+		xenv.mxcost = XDL_MAX_COST_MIN;
+	xenv.snake_cnt = XDL_SNAKE_CNT;
+	xenv.heur_min = XDL_HEUR_MIN_COST;
+
+	dd1.nrec = xe->xdf1.nreff;
+	dd1.ha = xe->xdf1.ha;
+	dd1.rchg = xe->xdf1.rchg;
+	dd1.rindex = xe->xdf1.rindex;
+	dd2.nrec = xe->xdf2.nreff;
+	dd2.ha = xe->xdf2.ha;
+	dd2.rchg = xe->xdf2.rchg;
+	dd2.rindex = xe->xdf2.rindex;
+
+	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
+			 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {
+
+		xdl_free(kvd);
+		xdl_free_env(xe);
+		return -1;
+	}
+
+	xdl_free(kvd);
+
+	return 0;
+}
+
+
+static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
+	xdchange_t *xch;
+
+	if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
+		return NULL;
+
+	xch->next = xscr;
+	xch->i1 = i1;
+	xch->i2 = i2;
+	xch->chg1 = chg1;
+	xch->chg2 = chg2;
+
+	return xch;
+}
+
+
+int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
+	xdchange_t *cscr = NULL, *xch;
+	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
+	long i1, i2, l1, l2;
+
+	/*
+	 * Trivial. Collects "groups" of changes and creates an edit script.
+	 */
+	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
+		if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
+			for (l1 = i1; rchg1[i1 - 1]; i1--);
+			for (l2 = i2; rchg2[i2 - 1]; i2--);
+
+			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
+				xdl_free_script(cscr);
+				return -1;
+			}
+			cscr = xch;
+		}
+
+	*xscr = cscr;
+
+	return 0;
+}
+
+
+void xdl_free_script(xdchange_t *xscr) {
+	xdchange_t *xch;
+
+	while ((xch = xscr) != NULL) {
+		xscr = xscr->next;
+		xdl_free(xch);
+	}
+}
+
+
+int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
+	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
+	xdchange_t *xscr;
+	xdfenv_t xe;
+
+	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
+
+		return -1;
+	}
+
+	if (xdl_build_script(&xe, &xscr) < 0) {
+
+		xdl_free_env(&xe);
+		return -1;
+	}
+
+	if (xscr) {
+		if (xdl_emit_diff(&xe, xscr, ecb, xecfg) < 0) {
+
+			xdl_free_script(xscr);
+			xdl_free_env(&xe);
+			return -1;
+		}
+
+		xdl_free_script(xscr);
+	}
+
+	xdl_free_env(&xe);
+
+	return 0;
+}
+
author	Linus Torvalds <torvalds@osdl.org>	2006-03-25 07:13:22 +0300
committer	Junio C Hamano <junkio@cox.net>	2006-03-26 04:49:58 +0400
commit	3443546f6ef57fe28ea5cca232df8e400bfc3883 (patch)
tree	01b252a45c73d900a97fe305d696f386658e5d7d /xdiff/xdiffi.c
parent	c150462824008957f568ca7aa05a65b35d860eb9 (diff)