speedup for fast gauss blur (approx 10% - 15%)

- get the image width and height once rather than calculating them on every access (was doing min/max subtract). - use unsigned ints - faster for looping.
author: Campbell Barton <ideasman42@gmail.com> 2012-06-16 13:52:38 +0400
committer: Campbell Barton <ideasman42@gmail.com> 2012-06-16 13:52:38 +0400
commit: 2f29f8d18656e9c8796b68671a60812d0cffcb70 (patch)
tree: 73418dba2888b792df0c272699fe391a2bb9062b /source/blender/nodes/composite/node_composite_util.c
parent: 250e919b7c1fa3c70925c87d625fa5e0f2d298ab (diff)
1 files changed, 68 insertions, 61 deletions
diff --git a/source/blender/nodes/composite/node_composite_util.c b/source/blender/nodes/composite/node_composite_util.c
index afd10d96e99..70788dfe0c8 100644
--- a/source/blender/nodes/composite/node_composite_util.c
+++ b/source/blender/nodes/composite/node_composite_util.c
@@ -32,6 +32,8 @@
 
 #include "node_composite_util.h"
 
+#include <limits.h>
+
 CompBuf *alloc_compbuf(int sizex, int sizey, int type, int alloc)
 {
 	CompBuf *cbuf= MEM_callocN(sizeof(CompBuf), "compbuf");
@@ -1300,33 +1302,35 @@ void IIR_gauss(CompBuf* src, float sigma, int chan, int xy)
 {
 	double q, q2, sc, cf[4], tsM[9], tsu[3], tsv[3];
 	double *X, *Y, *W;
-	int i, x, y, sz;
+	const unsigned int src_width = src->x;
+	const unsigned int src_height = src->y;
+	unsigned int i, x, y, sz;
 
 	// <0.5 not valid, though can have a possibly useful sort of sharpening effect
 	if (sigma < 0.5f) return;
-	
+
 	if ((xy < 1) || (xy > 3)) xy = 3;
-	
+
 	// XXX The YVV macro defined below explicitly expects sources of at least 3x3 pixels,
 	//     so just skiping blur along faulty direction if src's def is below that limit!
-	if (src->x < 3) xy &= ~(int) 1;
-	if (src->y < 3) xy &= ~(int) 2;
+	if (src_width < 3) xy &= ~(int) 1;
+	if (src_height < 3) xy &= ~(int) 2;
 	if (xy < 1) return;
 
 	// see "Recursive Gabor Filtering" by Young/VanVliet
 	// all factors here in double.prec. Required, because for single.prec it seems to blow up if sigma > ~200
 	if (sigma >= 3.556f)
-		q = 0.9804f*(sigma - 3.556f) + 2.5091f;
-	else // sigma >= 0.5
-		q = (0.0561f*sigma + 0.5784f)*sigma - 0.2568f;
-	q2 = q*q;
-	sc = (1.1668 + q)*(3.203729649  + (2.21566 + q)*q);
+		q = 0.9804f * (sigma - 3.556f) + 2.5091f;
+	else     // sigma >= 0.5
+		q = (0.0561f * sigma + 0.5784f) * sigma - 0.2568f;
+	q2 = q * q;
+	sc = (1.1668 + q) * (3.203729649  + (2.21566 + q) * q);
 	// no gabor filtering here, so no complex multiplies, just the regular coefs.
 	// all negated here, so as not to have to recalc Triggs/Sdika matrix
-	cf[1] = q*(5.788961737 + (6.76492 + 3.0*q)*q)/ sc;
-	cf[2] = -q2*(3.38246 + 3.0*q)/sc;
+	cf[1] = q * (5.788961737 + (6.76492 + 3.0 * q) * q) / sc;
+	cf[2] = -q2 * (3.38246 + 3.0 * q) / sc;
 	// 0 & 3 unchanged
-	cf[3] = q2*q/sc;
+	cf[3] = q2 * q / sc;
 	cf[0] = 1.0 - cf[1] - cf[2] - cf[3];
 
 	// Triggs/Sdika border corrections,
@@ -1336,59 +1340,62 @@ void IIR_gauss(CompBuf* src, float sigma, int chan, int xy)
 	// but neither seem to be quite the same, result seems to be ok so far anyway.
 	// Extra scale factor here to not have to do it in filter,
 	// though maybe this had something to with the precision errors
-	sc = cf[0]/((1.0 + cf[1] - cf[2] + cf[3])*(1.0 - cf[1] - cf[2] - cf[3])*(1.0 + cf[2] + (cf[1] - cf[3])*cf[3]));
-	tsM[0] = sc*(-cf[3]*cf[1] + 1.0 - cf[3]*cf[3] - cf[2]);
-	tsM[1] = sc*((cf[3] + cf[1])*(cf[2] + cf[3]*cf[1]));
-	tsM[2] = sc*(cf[3]*(cf[1] + cf[3]*cf[2]));
-	tsM[3] = sc*(cf[1] + cf[3]*cf[2]);
-	tsM[4] = sc*(-(cf[2] - 1.0)*(cf[2] + cf[3]*cf[1]));
-	tsM[5] = sc*(-(cf[3]*cf[1] + cf[3]*cf[3] + cf[2] - 1.0)*cf[3]);
-	tsM[6] = sc*(cf[3]*cf[1] + cf[2] + cf[1]*cf[1] - cf[2]*cf[2]);
-	tsM[7] = sc*(cf[1]*cf[2] + cf[3]*cf[2]*cf[2] - cf[1]*cf[3]*cf[3] - cf[3]*cf[3]*cf[3] - cf[3]*cf[2] + cf[3]);
-	tsM[8] = sc*(cf[3]*(cf[1] + cf[3]*cf[2]));
-
-#define YVV(L)                                                                \
-{                                                                             \
-	W[0] = cf[0]*X[0] + cf[1]*X[0] + cf[2]*X[0] + cf[3]*X[0];                 \
-	W[1] = cf[0]*X[1] + cf[1]*W[0] + cf[2]*X[0] + cf[3]*X[0];                 \
-	W[2] = cf[0]*X[2] + cf[1]*W[1] + cf[2]*W[0] + cf[3]*X[0];                 \
-	for (i=3; i<L; i++)                                                       \
-		W[i] = cf[0]*X[i] + cf[1]*W[i-1] + cf[2]*W[i-2] + cf[3]*W[i-3];       \
-	tsu[0] = W[L-1] - X[L-1];                                                 \
-	tsu[1] = W[L-2] - X[L-1];                                                 \
-	tsu[2] = W[L-3] - X[L-1];                                                 \
-	tsv[0] = tsM[0]*tsu[0] + tsM[1]*tsu[1] + tsM[2]*tsu[2] + X[L-1];          \
-	tsv[1] = tsM[3]*tsu[0] + tsM[4]*tsu[1] + tsM[5]*tsu[2] + X[L-1];          \
-	tsv[2] = tsM[6]*tsu[0] + tsM[7]*tsu[1] + tsM[8]*tsu[2] + X[L-1];          \
-	Y[L-1] = cf[0]*W[L-1] + cf[1]*tsv[0] + cf[2]*tsv[1] + cf[3]*tsv[2];       \
-	Y[L-2] = cf[0]*W[L-2] + cf[1]*Y[L-1] + cf[2]*tsv[0] + cf[3]*tsv[1];       \
-	Y[L-3] = cf[0]*W[L-3] + cf[1]*Y[L-2] + cf[2]*Y[L-1] + cf[3]*tsv[0];       \
-	for (i=L-4; i>=0; i--)                                                    \
-		Y[i] = cf[0]*W[i] + cf[1]*Y[i+1] + cf[2]*Y[i+2] + cf[3]*Y[i+3];       \
+	sc = cf[0] / ((1.0 + cf[1] - cf[2] + cf[3]) * (1.0 - cf[1] - cf[2] - cf[3]) * (1.0 + cf[2] + (cf[1] - cf[3]) * cf[3]));
+	tsM[0] = sc * (-cf[3] * cf[1] + 1.0 - cf[3] * cf[3] - cf[2]);
+	tsM[1] = sc * ((cf[3] + cf[1]) * (cf[2] + cf[3] * cf[1]));
+	tsM[2] = sc * (cf[3] * (cf[1] + cf[3] * cf[2]));
+	tsM[3] = sc * (cf[1] + cf[3] * cf[2]);
+	tsM[4] = sc * (-(cf[2] - 1.0) * (cf[2] + cf[3] * cf[1]));
+	tsM[5] = sc * (-(cf[3] * cf[1] + cf[3] * cf[3] + cf[2] - 1.0) * cf[3]);
+	tsM[6] = sc * (cf[3] * cf[1] + cf[2] + cf[1] * cf[1] - cf[2] * cf[2]);
+	tsM[7] = sc * (cf[1] * cf[2] + cf[3] * cf[2] * cf[2] - cf[1] * cf[3] * cf[3] - cf[3] * cf[3] * cf[3] - cf[3] * cf[2] + cf[3]);
+	tsM[8] = sc * (cf[3] * (cf[1] + cf[3] * cf[2]));
+
+#define YVV(L)                                                                          \
+{                                                                                       \
+	W[0] = cf[0] * X[0] + cf[1] * X[0] + cf[2] * X[0] + cf[3] * X[0];                   \
+	W[1] = cf[0] * X[1] + cf[1] * W[0] + cf[2] * X[0] + cf[3] * X[0];                   \
+	W[2] = cf[0] * X[2] + cf[1] * W[1] + cf[2] * W[0] + cf[3] * X[0];                   \
+	for (i = 3; i < L; i++) {                                                           \
+		W[i] = cf[0] * X[i] + cf[1] * W[i - 1] + cf[2] * W[i - 2] + cf[3] * W[i - 3];   \
+	}                                                                                   \
+	tsu[0] = W[L - 1] - X[L - 1];                                                       \
+	tsu[1] = W[L - 2] - X[L - 1];                                                       \
+	tsu[2] = W[L - 3] - X[L - 1];                                                       \
+	tsv[0] = tsM[0] * tsu[0] + tsM[1] * tsu[1] + tsM[2] * tsu[2] + X[L - 1];            \
+	tsv[1] = tsM[3] * tsu[0] + tsM[4] * tsu[1] + tsM[5] * tsu[2] + X[L - 1];            \
+	tsv[2] = tsM[6] * tsu[0] + tsM[7] * tsu[1] + tsM[8] * tsu[2] + X[L - 1];            \
+	Y[L - 1] = cf[0] * W[L - 1] + cf[1] * tsv[0] + cf[2] * tsv[1] + cf[3] * tsv[2];     \
+	Y[L - 2] = cf[0] * W[L - 2] + cf[1] * Y[L - 1] + cf[2] * tsv[0] + cf[3] * tsv[1];   \
+	Y[L - 3] = cf[0] * W[L - 3] + cf[1] * Y[L - 2] + cf[2] * Y[L - 1] + cf[3] * tsv[0]; \
+	/* 'i != UINT_MAX' is really 'i >= 0', but necessary for unsigned int wrapping */   \
+	for (i = L - 4; i != UINT_MAX; i--) {                                               \
+		Y[i] = cf[0] * W[i] + cf[1] * Y[i + 1] + cf[2] * Y[i + 2] + cf[3] * Y[i + 3];   \
+	}                                                                                   \
 } (void)0
 
 	// intermediate buffers
-	sz = MAX2(src->x, src->y);
-	X = MEM_callocN(sz*sizeof(double), "IIR_gauss X buf");
-	Y = MEM_callocN(sz*sizeof(double), "IIR_gauss Y buf");
-	W = MEM_callocN(sz*sizeof(double), "IIR_gauss W buf");
-	if (xy & 1) {	// H
-		for (y=0; y<src->y; ++y) {
-			const int yx = y*src->x;
-			for (x=0; x<src->x; ++x)
-				X[x] = src->rect[(x + yx)*src->type + chan];
-			YVV(src->x);
-			for (x=0; x<src->x; ++x)
-				src->rect[(x + yx)*src->type + chan] = Y[x];
+	sz = MAX2(src_width, src_height);
+	X = MEM_callocN(sz * sizeof(double), "IIR_gauss X buf");
+	Y = MEM_callocN(sz * sizeof(double), "IIR_gauss Y buf");
+	W = MEM_callocN(sz * sizeof(double), "IIR_gauss W buf");
+	if (xy & 1) {       // H
+		for (y = 0; y < src_height; ++y) {
+			const int yx = y * src_width;
+			for (x = 0; x < src_width; ++x)
+				X[x] = src->rect[(x + yx) * src->type + chan];
+			YVV(src_width);
+			for (x = 0; x < src_width; ++x)
+				src->rect[(x + yx) * src->type + chan] = Y[x];
 		}
 	}
-	if (xy & 2) {	// V
-		for (x=0; x<src->x; ++x) {
-			for (y=0; y<src->y; ++y)
-				X[y] = src->rect[(x + y*src->x)*src->type + chan];
-			YVV(src->y);
-			for (y=0; y<src->y; ++y)
-				src->rect[(x + y*src->x)*src->type + chan] = Y[y];
+	if (xy & 2) {       // V
+		for (x = 0; x < src_width; ++x) {
+			for (y = 0; y < src_height; ++y)
+				X[y] = src->rect[(x + y * src_width) * src->type + chan];
+			YVV(src_height);
+			for (y = 0; y < src_height; ++y)
+				src->rect[(x + y * src_width) * src->type + chan] = Y[y];
 		}
 	}
author	Campbell Barton <ideasman42@gmail.com>	2012-06-16 13:52:38 +0400
committer	Campbell Barton <ideasman42@gmail.com>	2012-06-16 13:52:38 +0400
commit	2f29f8d18656e9c8796b68671a60812d0cffcb70 (patch)
tree	73418dba2888b792df0c272699fe391a2bb9062b /source/blender/nodes/composite/node_composite_util.c
parent	250e919b7c1fa3c70925c87d625fa5e0f2d298ab (diff)