Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.

This makes the code a bit easier to understand, and might come in handy if we want to reuse more Embree code.

Differential Revision: https://developer.blender.org/D482

Code by Brecht, with fixes by Lockal, Sergey and myself.
author: Thomas Dinges <blender@dingto.org> 2014-06-13 23:13:18 +0400
committer: Thomas Dinges <blender@dingto.org> 2014-06-13 23:59:12 +0400
commit: cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed (patch)
tree: 578ee132eab87d348147e49c91e1929660558c20 /intern/cycles/util/util_color.h
parent: d0573ce9054e325c0ad2fbb943087e0f8b9e159a (diff)
1 file changed, 23 insertions, 23 deletions
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index b72cc6bc873..d566e1bf359 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -155,28 +155,28 @@ ccl_device float3 color_srgb_to_scene_linear(float3 c)
  * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
  */
 template<unsigned exp, unsigned e2coeff>
-ccl_device_inline __m128 fastpow(const __m128 &arg)
+ccl_device_inline ssef fastpow(const ssef &arg)
 {
-	__m128 ret;
-	ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
-	ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
-	ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
-	ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+	ssef ret;
+	ret = arg * cast(ssei(e2coeff));
+	ret = ssef(cast(ret));
+	ret = ret * cast(ssei(exp));
+	ret = cast(ssei(ret));
 	return ret;
 }
 
 /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline __m128 improve_5throot_solution(const __m128 &old_result, const __m128 &x)
+ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
 {
-	__m128 approx2 = _mm_mul_ps(old_result, old_result);
-	__m128 approx4 = _mm_mul_ps(approx2, approx2);
-	__m128 t = _mm_div_ps(x, approx4);
-	__m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
-	return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f));
+	ssef approx2 = old_result * old_result;
+	ssef approx4 = approx2 * approx2;
+	ssef t = x / approx4;
+	ssef summ = madd(ssef(4.0f), old_result, t);
+	return summ * ssef(1.0f/5.0f);
 }
 
 /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline __m128 fastpow24(const __m128 &arg)
+ccl_device_inline ssef fastpow24(const ssef &arg)
 {
 	/* max, avg and |avg| errors were calculated in gcc without FMA instructions
 	 * The final precision should be better than powf in glibc */
@@ -184,22 +184,22 @@ ccl_device_inline __m128 fastpow24(const __m128 &arg)
 	/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
 	/* 0x3F4CCCCD = 4/5 */
 	/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
-	__m128 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17	avg = 0.0018	|avg| = 0.05
-	__m128 arg2 = _mm_mul_ps(arg, arg);
-	__m128 arg4 = _mm_mul_ps(arg2, arg2);
+	ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17	avg = 0.0018	|avg| = 0.05
+	ssef arg2 = arg * arg;
+	ssef arg4 = arg2 * arg2;
 	x = improve_5throot_solution(x, arg4); /* error max = 0.018		avg = 0.0031	|avg| = 0.0031  */
 	x = improve_5throot_solution(x, arg4); /* error max = 0.00021	avg = 1.6e-05	|avg| = 1.6e-05 */
 	x = improve_5throot_solution(x, arg4); /* error max = 6.1e-07	avg = 5.2e-08	|avg| = 1.1e-07 */
-	return _mm_mul_ps(x, _mm_mul_ps(x, x));
+	return x * (x * x);
 }
 
-ccl_device __m128 color_srgb_to_scene_linear(const __m128 &c)
+ccl_device ssef color_srgb_to_scene_linear(const ssef &c)
 {
-	__m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f));
-	__m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)), _mm_set1_ps(0.0f));
-	__m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)), _mm_set1_ps(1.0f/1.055f)); /* fma */
-	__m128 gte = fastpow24(gtebase);
-	return blend(cmp, lt, gte);
+	sseb cmp = c < ssef(0.04045f);
+	ssef lt = max(c * ssef(1.0f/12.92f), ssef(0.0f));
+	ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f/1.055f); /* fma */
+	ssef gte = fastpow24(gtebase);
+	return select(cmp, lt, gte);
 }
 #endif
author	Thomas Dinges <blender@dingto.org>	2014-06-13 23:13:18 +0400
committer	Thomas Dinges <blender@dingto.org>	2014-06-13 23:59:12 +0400
commit	cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed (patch)
tree	578ee132eab87d348147e49c91e1929660558c20 /intern/cycles/util/util_color.h
parent	d0573ce9054e325c0ad2fbb943087e0f8b9e159a (diff)