Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2014-06-13 22:30:13 +0400
committerSergey Sharybin <sergey.vfx@gmail.com>2014-06-13 22:38:07 +0400
commita87fb34edaf1a10f5527b6dc8a506a1c9ecbc683 (patch)
tree06386145cbf7f9dcf6684b3a39722ed4d4e62c4d /source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
parentb0708dd7189dfef21f7f9af5e98b0a7e1369e507 (diff)
Take advantage of SSE2 instructions in gaussian blur node
This gives around a 30% speedup for the gaussian blur node. It is a pretty straightforward implementation inside the node itself, but some additional things needed to be implemented: - Aligned malloc. It's needed to load data into SSE registers faster. Based on the aligned_malloc() from Libmv, with some additional trickery going on to support arbitrary alignment (this magic is needed because of MemHead). In practice only 16-byte alignment is supported, because of the lack of an aligned malloc with arbitrary alignment on OSX. Not a big deal for now, because we only need 16-byte alignment at this moment. Could be tweaked further later. - Memory buffers in the compositor are now aligned to 16 bytes. This should be harmless for non-SSE cases too; just mentioning it. Reviewers: campbellbarton, lukastoenne, jbakker Reviewed By: campbellbarton CC: lockal Differential Revision: https://developer.blender.org/D564
Diffstat (limited to 'source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp')
-rw-r--r--source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp37
1 files changed, 35 insertions, 2 deletions
diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
index d08924ca4ef..0aefba3bb7c 100644
--- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
+++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
@@ -31,6 +31,9 @@ extern "C" {
GaussianXBlurOperation::GaussianXBlurOperation() : BlurBaseOperation(COM_DT_COLOR)
{
this->m_gausstab = NULL;
+#ifdef __SSE2__
+ this->m_gausstab_sse = NULL;
+#endif
this->m_filtersize = 0;
}
@@ -54,8 +57,14 @@ void GaussianXBlurOperation::initExecution()
if (this->m_sizeavailable) {
float rad = max_ff(m_size * m_data.sizex, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
+ /* TODO(sergey): De-duplicate with the case below and Y blur. */
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -65,8 +74,13 @@ void GaussianXBlurOperation::updateGauss()
updateSize();
float rad = max_ff(m_size * m_data.sizex, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -88,12 +102,25 @@ void GaussianXBlurOperation::executePixel(float output[4], int x, int y, void *d
int step = getStep();
int offsetadd = getOffsetAdd();
int bufferindex = ((xmin - bufferstartx) * 4) + ((ymin - bufferstarty) * 4 * bufferwidth);
+
+#ifdef __SSE2__
+ __m128 accum_r = _mm_load_ps(color_accum);
+ for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
+ __m128 reg_a = _mm_load_ps(&buffer[bufferindex]);
+ reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]);
+ accum_r = _mm_add_ps(accum_r, reg_a);
+ multiplier_accum += this->m_gausstab[index];
+ bufferindex += offsetadd;
+ }
+ _mm_store_ps(color_accum, accum_r);
+#else
for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
const float multiplier = this->m_gausstab[index];
madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier);
multiplier_accum += multiplier;
bufferindex += offsetadd;
}
+#endif
mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum);
}
@@ -105,6 +132,12 @@ void GaussianXBlurOperation::deinitExecution()
MEM_freeN(this->m_gausstab);
this->m_gausstab = NULL;
}
+#ifdef __SSE2__
+ if (this->m_gausstab_sse) {
+ MEM_freeN(this->m_gausstab_sse);
+ this->m_gausstab_sse = NULL;
+ }
+#endif
deinitMutex();
}