Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSv. Lockal <lockalsash@gmail.com>2014-01-06 19:53:38 +0400
committerSv. Lockal <lockalsash@gmail.com>2014-01-06 20:03:30 +0400
commit96903508bc3faec99bac8007e344016698630aae (patch)
tree1f7faea7cf8735ecc55d7049b939233fcbb0e8e0 /intern
parente0a4a4afc31e101bac6238ac4e0785935610a862 (diff)
Cycles: SSE optimization for sRGB conversion (gives 7% speedup on CPU for pavillon_barcelone scene)
Thanks brecht/dingto/juicyfruit et al. for testing and reviewing this patch in T38034.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/kernel/svm/svm_image.h7
-rw-r--r--intern/cycles/util/util_color.h58
-rw-r--r--intern/cycles/util/util_simd.h13
3 files changed, 78 insertions, 0 deletions
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 58e5775265a..45a2a8b802c 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -247,9 +247,16 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
}
if(srgb) {
+#ifdef __KERNEL_SSE2__
+ float alpha = r.w;
+ __m128 *rv = (__m128 *)&r;
+ *rv = color_srgb_to_scene_linear(*rv);
+ r.w = alpha;
+#else
r.x = color_srgb_to_scene_linear(r.x);
r.y = color_srgb_to_scene_linear(r.y);
r.z = color_srgb_to_scene_linear(r.z);
+#endif
}
return r;
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 8b13a006673..4e195506421 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -19,6 +19,7 @@
#include "util_math.h"
#include "util_types.h"
+#include "util_simd.h"
CCL_NAMESPACE_BEGIN
@@ -142,6 +143,63 @@ ccl_device float3 color_srgb_to_scene_linear(float3 c)
color_srgb_to_scene_linear(c.z));
}
+#ifdef __KERNEL_SSE2__
+/*
+ * Calculate initial guess for arg^exp based on float representation
+ * This method gives a constant bias, which can be easily compensated by multiplying by bias_coeff.
+ * Gives better results for exponents near 1 (e.g. 4/5).
+ * exp = exponent, encoded as uint32_t
+ * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
+ */
+template<unsigned exp, unsigned e2coeff>
+ccl_device_inline __m128 fastpow(const __m128 &arg)
+{
+ __m128 ret;
+ ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
+ ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+ ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
+ ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+ return ret;
+}
+
+/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
+ccl_device_inline __m128 improve_5throot_solution(const __m128 &old_result, const __m128 &x)
+{
+ __m128 approx2 = _mm_mul_ps(old_result, old_result);
+ __m128 approx4 = _mm_mul_ps(approx2, approx2);
+ __m128 t = _mm_div_ps(x, approx4);
+ __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
+ return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f));
+}
+
+/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
+ccl_device_inline __m128 fastpow24(const __m128 &arg)
+{
+ /* max, avg and |avg| errors were calculated in gcc without FMA instructions
+ * The final precision should be better than powf in glibc */
+
+ /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
+ /* 0x3F4CCCCD = 4/5 */
+ /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
+ __m128 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
+ __m128 arg2 = _mm_mul_ps(arg, arg);
+ __m128 arg4 = _mm_mul_ps(arg2, arg2);
+ x = improve_5throot_solution(x, arg4); /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
+ x = improve_5throot_solution(x, arg4); /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */
+ x = improve_5throot_solution(x, arg4); /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */
+ return _mm_mul_ps(x, _mm_mul_ps(x, x));
+}
+
+ccl_device __m128 color_srgb_to_scene_linear(const __m128 &c)
+{
+ __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f));
+ __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)), _mm_set1_ps(0.0f));
+ __m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)), _mm_set1_ps(1.0f/1.055f)); /* fma */
+ __m128 gte = fastpow24(gtebase);
+ return blend(cmp, lt, gte);
+}
+#endif
+
ccl_device float3 color_scene_linear_to_srgb(float3 c)
{
return make_float3(
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 119ea2ac211..02e2880212e 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -79,6 +79,19 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}
+/* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */
+#ifdef __KERNEL_SSE41__
+ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b)
+{
+ return _mm_blendv_ps(b, a, mask);
+}
+#else
+ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b)
+{
+ return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
+#endif
+
#endif /* __KERNEL_SSE2__ */
CCL_NAMESPACE_END