diff options
author | Campbell Barton <ideasman42@gmail.com> | 2012-06-16 13:52:38 +0400 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2012-06-16 13:52:38 +0400 |
commit | 2f29f8d18656e9c8796b68671a60812d0cffcb70 (patch) | |
tree | 73418dba2888b792df0c272699fe391a2bb9062b /source/blender/nodes/composite/node_composite_util.c | |
parent | 250e919b7c1fa3c70925c87d625fa5e0f2d298ab (diff) |
speedup for fast gauss blur (approx 10% - 15%)
- get the image width and height once rather than calculating on every access (was doing min/max subtract).
- use unsigned ints - faster for looping.
Diffstat (limited to 'source/blender/nodes/composite/node_composite_util.c')
-rw-r--r-- | source/blender/nodes/composite/node_composite_util.c | 129 |
1 files changed, 68 insertions, 61 deletions
diff --git a/source/blender/nodes/composite/node_composite_util.c b/source/blender/nodes/composite/node_composite_util.c index afd10d96e99..70788dfe0c8 100644 --- a/source/blender/nodes/composite/node_composite_util.c +++ b/source/blender/nodes/composite/node_composite_util.c @@ -32,6 +32,8 @@ #include "node_composite_util.h" +#include <limits.h> + CompBuf *alloc_compbuf(int sizex, int sizey, int type, int alloc) { CompBuf *cbuf= MEM_callocN(sizeof(CompBuf), "compbuf"); @@ -1300,33 +1302,35 @@ void IIR_gauss(CompBuf* src, float sigma, int chan, int xy) { double q, q2, sc, cf[4], tsM[9], tsu[3], tsv[3]; double *X, *Y, *W; - int i, x, y, sz; + const unsigned int src_width = src->x; + const unsigned int src_height = src->y; + unsigned int i, x, y, sz; // <0.5 not valid, though can have a possibly useful sort of sharpening effect if (sigma < 0.5f) return; - + if ((xy < 1) || (xy > 3)) xy = 3; - + // XXX The YVV macro defined below explicitly expects sources of at least 3x3 pixels, // so just skiping blur along faulty direction if src's def is below that limit! - if (src->x < 3) xy &= ~(int) 1; - if (src->y < 3) xy &= ~(int) 2; + if (src_width < 3) xy &= ~(int) 1; + if (src_height < 3) xy &= ~(int) 2; if (xy < 1) return; // see "Recursive Gabor Filtering" by Young/VanVliet // all factors here in double.prec. Required, because for single.prec it seems to blow up if sigma > ~200 if (sigma >= 3.556f) - q = 0.9804f*(sigma - 3.556f) + 2.5091f; - else // sigma >= 0.5 - q = (0.0561f*sigma + 0.5784f)*sigma - 0.2568f; - q2 = q*q; - sc = (1.1668 + q)*(3.203729649 + (2.21566 + q)*q); + q = 0.9804f * (sigma - 3.556f) + 2.5091f; + else // sigma >= 0.5 + q = (0.0561f * sigma + 0.5784f) * sigma - 0.2568f; + q2 = q * q; + sc = (1.1668 + q) * (3.203729649 + (2.21566 + q) * q); // no gabor filtering here, so no complex multiplies, just the regular coefs. 
// all negated here, so as not to have to recalc Triggs/Sdika matrix - cf[1] = q*(5.788961737 + (6.76492 + 3.0*q)*q)/ sc; - cf[2] = -q2*(3.38246 + 3.0*q)/sc; + cf[1] = q * (5.788961737 + (6.76492 + 3.0 * q) * q) / sc; + cf[2] = -q2 * (3.38246 + 3.0 * q) / sc; // 0 & 3 unchanged - cf[3] = q2*q/sc; + cf[3] = q2 * q / sc; cf[0] = 1.0 - cf[1] - cf[2] - cf[3]; // Triggs/Sdika border corrections, @@ -1336,59 +1340,62 @@ void IIR_gauss(CompBuf* src, float sigma, int chan, int xy) // but neither seem to be quite the same, result seems to be ok so far anyway. // Extra scale factor here to not have to do it in filter, // though maybe this had something to with the precision errors - sc = cf[0]/((1.0 + cf[1] - cf[2] + cf[3])*(1.0 - cf[1] - cf[2] - cf[3])*(1.0 + cf[2] + (cf[1] - cf[3])*cf[3])); - tsM[0] = sc*(-cf[3]*cf[1] + 1.0 - cf[3]*cf[3] - cf[2]); - tsM[1] = sc*((cf[3] + cf[1])*(cf[2] + cf[3]*cf[1])); - tsM[2] = sc*(cf[3]*(cf[1] + cf[3]*cf[2])); - tsM[3] = sc*(cf[1] + cf[3]*cf[2]); - tsM[4] = sc*(-(cf[2] - 1.0)*(cf[2] + cf[3]*cf[1])); - tsM[5] = sc*(-(cf[3]*cf[1] + cf[3]*cf[3] + cf[2] - 1.0)*cf[3]); - tsM[6] = sc*(cf[3]*cf[1] + cf[2] + cf[1]*cf[1] - cf[2]*cf[2]); - tsM[7] = sc*(cf[1]*cf[2] + cf[3]*cf[2]*cf[2] - cf[1]*cf[3]*cf[3] - cf[3]*cf[3]*cf[3] - cf[3]*cf[2] + cf[3]); - tsM[8] = sc*(cf[3]*(cf[1] + cf[3]*cf[2])); - -#define YVV(L) \ -{ \ - W[0] = cf[0]*X[0] + cf[1]*X[0] + cf[2]*X[0] + cf[3]*X[0]; \ - W[1] = cf[0]*X[1] + cf[1]*W[0] + cf[2]*X[0] + cf[3]*X[0]; \ - W[2] = cf[0]*X[2] + cf[1]*W[1] + cf[2]*W[0] + cf[3]*X[0]; \ - for (i=3; i<L; i++) \ - W[i] = cf[0]*X[i] + cf[1]*W[i-1] + cf[2]*W[i-2] + cf[3]*W[i-3]; \ - tsu[0] = W[L-1] - X[L-1]; \ - tsu[1] = W[L-2] - X[L-1]; \ - tsu[2] = W[L-3] - X[L-1]; \ - tsv[0] = tsM[0]*tsu[0] + tsM[1]*tsu[1] + tsM[2]*tsu[2] + X[L-1]; \ - tsv[1] = tsM[3]*tsu[0] + tsM[4]*tsu[1] + tsM[5]*tsu[2] + X[L-1]; \ - tsv[2] = tsM[6]*tsu[0] + tsM[7]*tsu[1] + tsM[8]*tsu[2] + X[L-1]; \ - Y[L-1] = cf[0]*W[L-1] + cf[1]*tsv[0] + cf[2]*tsv[1] + cf[3]*tsv[2]; 
\ - Y[L-2] = cf[0]*W[L-2] + cf[1]*Y[L-1] + cf[2]*tsv[0] + cf[3]*tsv[1]; \ - Y[L-3] = cf[0]*W[L-3] + cf[1]*Y[L-2] + cf[2]*Y[L-1] + cf[3]*tsv[0]; \ - for (i=L-4; i>=0; i--) \ - Y[i] = cf[0]*W[i] + cf[1]*Y[i+1] + cf[2]*Y[i+2] + cf[3]*Y[i+3]; \ + sc = cf[0] / ((1.0 + cf[1] - cf[2] + cf[3]) * (1.0 - cf[1] - cf[2] - cf[3]) * (1.0 + cf[2] + (cf[1] - cf[3]) * cf[3])); + tsM[0] = sc * (-cf[3] * cf[1] + 1.0 - cf[3] * cf[3] - cf[2]); + tsM[1] = sc * ((cf[3] + cf[1]) * (cf[2] + cf[3] * cf[1])); + tsM[2] = sc * (cf[3] * (cf[1] + cf[3] * cf[2])); + tsM[3] = sc * (cf[1] + cf[3] * cf[2]); + tsM[4] = sc * (-(cf[2] - 1.0) * (cf[2] + cf[3] * cf[1])); + tsM[5] = sc * (-(cf[3] * cf[1] + cf[3] * cf[3] + cf[2] - 1.0) * cf[3]); + tsM[6] = sc * (cf[3] * cf[1] + cf[2] + cf[1] * cf[1] - cf[2] * cf[2]); + tsM[7] = sc * (cf[1] * cf[2] + cf[3] * cf[2] * cf[2] - cf[1] * cf[3] * cf[3] - cf[3] * cf[3] * cf[3] - cf[3] * cf[2] + cf[3]); + tsM[8] = sc * (cf[3] * (cf[1] + cf[3] * cf[2])); + +#define YVV(L) \ +{ \ + W[0] = cf[0] * X[0] + cf[1] * X[0] + cf[2] * X[0] + cf[3] * X[0]; \ + W[1] = cf[0] * X[1] + cf[1] * W[0] + cf[2] * X[0] + cf[3] * X[0]; \ + W[2] = cf[0] * X[2] + cf[1] * W[1] + cf[2] * W[0] + cf[3] * X[0]; \ + for (i = 3; i < L; i++) { \ + W[i] = cf[0] * X[i] + cf[1] * W[i - 1] + cf[2] * W[i - 2] + cf[3] * W[i - 3]; \ + } \ + tsu[0] = W[L - 1] - X[L - 1]; \ + tsu[1] = W[L - 2] - X[L - 1]; \ + tsu[2] = W[L - 3] - X[L - 1]; \ + tsv[0] = tsM[0] * tsu[0] + tsM[1] * tsu[1] + tsM[2] * tsu[2] + X[L - 1]; \ + tsv[1] = tsM[3] * tsu[0] + tsM[4] * tsu[1] + tsM[5] * tsu[2] + X[L - 1]; \ + tsv[2] = tsM[6] * tsu[0] + tsM[7] * tsu[1] + tsM[8] * tsu[2] + X[L - 1]; \ + Y[L - 1] = cf[0] * W[L - 1] + cf[1] * tsv[0] + cf[2] * tsv[1] + cf[3] * tsv[2]; \ + Y[L - 2] = cf[0] * W[L - 2] + cf[1] * Y[L - 1] + cf[2] * tsv[0] + cf[3] * tsv[1]; \ + Y[L - 3] = cf[0] * W[L - 3] + cf[1] * Y[L - 2] + cf[2] * Y[L - 1] + cf[3] * tsv[0]; \ + /* 'i != UINT_MAX' is really 'i >= 0', but necessary for unsigned int wrapping */ \ + 
for (i = L - 4; i != UINT_MAX; i--) { \ + Y[i] = cf[0] * W[i] + cf[1] * Y[i + 1] + cf[2] * Y[i + 2] + cf[3] * Y[i + 3]; \ + } \ } (void)0 // intermediate buffers - sz = MAX2(src->x, src->y); - X = MEM_callocN(sz*sizeof(double), "IIR_gauss X buf"); - Y = MEM_callocN(sz*sizeof(double), "IIR_gauss Y buf"); - W = MEM_callocN(sz*sizeof(double), "IIR_gauss W buf"); - if (xy & 1) { // H - for (y=0; y<src->y; ++y) { - const int yx = y*src->x; - for (x=0; x<src->x; ++x) - X[x] = src->rect[(x + yx)*src->type + chan]; - YVV(src->x); - for (x=0; x<src->x; ++x) - src->rect[(x + yx)*src->type + chan] = Y[x]; + sz = MAX2(src_width, src_height); + X = MEM_callocN(sz * sizeof(double), "IIR_gauss X buf"); + Y = MEM_callocN(sz * sizeof(double), "IIR_gauss Y buf"); + W = MEM_callocN(sz * sizeof(double), "IIR_gauss W buf"); + if (xy & 1) { // H + for (y = 0; y < src_height; ++y) { + const int yx = y * src_width; + for (x = 0; x < src_width; ++x) + X[x] = src->rect[(x + yx) * src->type + chan]; + YVV(src_width); + for (x = 0; x < src_width; ++x) + src->rect[(x + yx) * src->type + chan] = Y[x]; } } - if (xy & 2) { // V - for (x=0; x<src->x; ++x) { - for (y=0; y<src->y; ++y) - X[y] = src->rect[(x + y*src->x)*src->type + chan]; - YVV(src->y); - for (y=0; y<src->y; ++y) - src->rect[(x + y*src->x)*src->type + chan] = Y[y]; + if (xy & 2) { // V + for (x = 0; x < src_width; ++x) { + for (y = 0; y < src_height; ++y) + X[y] = src->rect[(x + y * src_width) * src->type + chan]; + YVV(src_height); + for (y = 0; y < src_height; ++y) + src->rect[(x + y * src_width) * src->type + chan] = Y[y]; } } |