From 47c5898fa15b9e82ddfe28891029101eaf1e67d2 Mon Sep 17 00:00:00 2001
From: "Sv. Lockal"
Date: Sun, 12 Jan 2014 18:14:00 +0400
Subject: Cycles: SSE for Voronoi textures (targeted for Haswell CPUs)

Gives up to 15% speedup in scenes with Voronoi-based textures (up to 25%
with volumes) on Haswell. The performance change for other CPUs is much
smaller: 1-2%.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D203
---
 intern/cycles/kernel/svm/svm_noise.h | 90 ++++++++++++++++++++++--------------
 intern/cycles/util/util_simd.h       | 19 +++++++-
 2 files changed, 72 insertions(+), 37 deletions(-)

diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 6ad10b7c8db..d431934f6ce 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,24 +32,31 @@ CCL_NAMESPACE_BEGIN
 
+#ifndef __KERNEL_SSE2__
 ccl_device int quick_floor(float x)
 {
 	return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i quick_floor_sse(const __m128 *x)
+#else
+ccl_device_inline __m128i quick_floor_sse(const __m128 &x)
 {
-	__m128i b = _mm_cvttps_epi32(*x);
-	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(*x, _mm_set1_ps(0.0f)));
+	__m128i b = _mm_cvttps_epi32(x);
+	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f)));
 	return _mm_add_epi32(b, isneg); // unsaturated add of 0xffffffff is the same as subtracting 1
 }
 #endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float bits_to_01(uint bits)
 {
 	return bits * (1.0f/(float)0xFFFFFFFF);
 }
+#else
+ccl_device_inline __m128 bits_to_01_sse(const __m128i &bits)
+{
+	return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF));
+}
+#endif
 
 ccl_device uint hash(uint kx, uint ky, uint kz)
 {
@@ -81,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
 }
 
 #ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i hash_sse(const __m128i *kx, const __m128i *ky, const __m128i *kz)
+ccl_device_inline __m128i hash_sse(const __m128i &kx, const __m128i &ky, const __m128i &kz)
 {
 #define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k)))
 #define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0)
 
 	uint len = 3;
 	__m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
-	__m128i a = _mm_add_epi32(magic, *kx);
-	__m128i b = _mm_add_epi32(magic, *ky);
-	__m128i c = _mm_add_epi32(magic, *kz);
+	__m128i a = _mm_add_epi32(magic, kx);
+	__m128i b = _mm_add_epi32(magic, ky);
+	__m128i c = _mm_add_epi32(magic, kz);
 
 	xor_rot(c, b, 14);
 	xor_rot(a, c, 11);
@@ -126,10 +133,10 @@ ccl_device float floorfrac(float x, int* i)
 	return x - *i;
 }
 #else
-ccl_device_inline __m128 floorfrac_sse(const __m128 *x, __m128i *i)
+ccl_device_inline __m128 floorfrac_sse(const __m128 &x, __m128i *i)
 {
 	*i = quick_floor_sse(x);
-	return _mm_sub_ps(*x, _mm_cvtepi32_ps(*i));
+	return _mm_sub_ps(x, _mm_cvtepi32_ps(*i));
 }
 #endif
 
@@ -153,10 +160,10 @@ ccl_device float nerp(float t, float a, float b)
 	return (1.0f - t) * a + t * b;
 }
 #else
-ccl_device_inline __m128 nerp_sse(const __m128 *t, const __m128 *a, const __m128 *b)
+ccl_device_inline __m128 nerp_sse(const __m128 &t, const __m128 &a, const __m128 &b)
 {
-	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), *t), *a);
-	return fma(*t, *b, x1);
+	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a);
+	return fma(t, b, x1);
 }
 #endif
 
@@ -171,16 +178,16 @@ ccl_device float grad(int hash, float x, float y, float z)
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 #else
-ccl_device_inline __m128 grad_sse(const __m128i *hash, const __m128 *x, const __m128 *y, const __m128 *z)
+ccl_device_inline __m128 grad_sse(const __m128i &hash, const __m128 &x, const __m128 &y, const __m128 &z)
 {
 	__m128i c1 = _mm_set1_epi32(1);
 	__m128i c2 = _mm_set1_epi32(2);
 
-	__m128i h = _mm_and_si128(*hash, _mm_set1_epi32(15));      // h = hash & 15
+	__m128i h = _mm_and_si128(hash, _mm_set1_epi32(15));       // h = hash & 15
 
 	__m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8));   // 0xffffffff if h < 8 else 0
-	__m128 u = blend(_mm_castsi128_ps(case_ux), *x, *y);       // u = h<8 ? x : y
+	__m128 u = blend(_mm_castsi128_ps(case_ux), x, y);         // u = h<8 ? x : y
 
 	__m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4));   // 0xffffffff if h < 4 else 0
 
@@ -189,7 +196,7 @@ ccl_device_inline __m128 grad_sse(const __m128i *hash, const __m128 *x, const __
 	__m128i case_vx = _mm_or_si128(case_h12, case_h14);        // 0xffffffff if h == 12 or h == 14 else 0
 
-	__m128 v = blend(_mm_castsi128_ps(case_vy), *y, blend(_mm_castsi128_ps(case_vx), *x, *z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+	__m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z));    // v = h<4 ? y : h == 12 || h == 14 ? x : z
 
 	__m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0
 	__m128 case_uneg_mask = _mm_castsi128_ps(case_uneg);          // -0.0 if h&1 else +0.0
@@ -210,9 +217,9 @@ ccl_device float scale3(float result)
 	return 0.9820f * result;
 }
 #else
-ccl_device_inline __m128 scale3_sse(const __m128 *result)
+ccl_device_inline __m128 scale3_sse(const __m128 &result)
 {
-	return _mm_mul_ps(_mm_set1_ps(0.9820f), *result);
+	return _mm_mul_ps(_mm_set1_ps(0.9820f), result);
 }
 #endif
 
@@ -248,7 +255,7 @@ ccl_device_noinline float perlin(float x, float y, float z)
 	__m128 xyz = _mm_setr_ps(x, y, z, 0.0f);
 	__m128i XYZ;
 
-	__m128 fxyz = floorfrac_sse(&xyz, &XYZ);
+	__m128 fxyz = floorfrac_sse(xyz, &XYZ);
 
 	__m128 uvw = fade_sse(&fxyz);
 	__m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
@@ -257,24 +264,24 @@ ccl_device_noinline float perlin(float x, float y, float z)
 	__m128i vp[] = {broadcast<0>(XYZ), broadcast<1>(XYZ), broadcast<2>(XYZ)};
 	__m128i vd[] = {_mm_add_epi32(vp[0], ci[0]), _mm_add_epi32(vp[1], ci[1]), _mm_add_epi32(vp[2], ci[2])};
 
-	__m128i h1 = hash_sse(vp, vd+1, vd+2);      // hash directions 000, 001, 010, 011 (vp[0] is not a typo, because vp[0]+0 == vp[0])
-	__m128i h2 = hash_sse(vd, vd+1, vd+2);      // hash directions 100, 101, 110, 111
+	__m128i h1 = hash_sse(vp[0], vd[1], vd[2]); // hash directions 000, 001, 010, 011 (vp[0] is not a typo, because vp[0]+0 == vp[0])
+	__m128i h2 = hash_sse(vd[0], vd[1], vd[2]); // hash directions 100, 101, 110, 111
 
 	__m128 cf[] = {_mm_setr_ps(1.0f, 1.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 0.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 1.0f, 0.0f, 1.0f)};
 	__m128 vf[] = {broadcast<0>(fxyz), broadcast<1>(fxyz), broadcast<2>(fxyz)};
 	__m128 vfd[] = {_mm_sub_ps(vf[0], cf[0]), _mm_sub_ps(vf[1], cf[1]), _mm_sub_ps(vf[2], cf[2])};
 
-	__m128 g1 = grad_sse(&h1, vf, vfd+1, vfd+2);     // vf is not a typo (same as above)
-	__m128 g2 = grad_sse(&h2, vfd, vfd+1, vfd+2);
-	__m128 n1 = nerp_sse(&u, &g1, &g2);
+	__m128 g1 = grad_sse(h1, vf[0], vfd[1], vfd[2]); // vf is not a typo (same as above)
+	__m128 g2 = grad_sse(h2, vfd[0], vfd[1], vfd[2]);
+	__m128 n1 = nerp_sse(u, g1, g2);
 
 	__m128 n1_half = _mm_movehl_ps(n1, n1);     // extract 2 floats to a separate vector
-	__m128 n2 = nerp_sse(&v, &n1, &n1_half);    // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+	__m128 n2 = nerp_sse(v, n1, n1_half);       // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
 
 	__m128 n2_second = broadcast<1>(n2);           // extract b to a separate vector
-	__m128 result = nerp_sse(&w, &n2, &n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+	__m128 result = nerp_sse(w, n2, n2_second);    // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
 
-	__m128 r = scale3_sse(&result);
+	__m128 r = scale3_sse(result);
 
 	__m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
 	__m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0
@@ -318,20 +325,21 @@ ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pper
 #endif
 
 /* perlin noise in range 0..1 */
-ccl_device float noise(float3 p)
+ccl_device float noise(const float3 &p)
 {
 	float r = perlin(p.x, p.y, p.z);
 	return 0.5f*r + 0.5f;
 }
 
 /* perlin noise in range -1..1 */
-ccl_device float snoise(float3 p)
+ccl_device float snoise(const float3 &p)
 {
 	return perlin(p.x, p.y, p.z);
 }
 
 /* cell noise */
-ccl_device_noinline float cellnoise(float3 p)
+#ifndef __KERNEL_SSE2__
+ccl_device_noinline float cellnoise(const float3 &p)
 {
 	uint ix = quick_floor(p.x);
 	uint iy = quick_floor(p.y);
@@ -340,7 +348,7 @@ ccl_device_noinline float cellnoise(float3 p)
 	return bits_to_01(hash(ix, iy, iz));
 }
 
-ccl_device float3 cellnoise_color(float3 p)
+ccl_device float3 cellnoise_color(const float3 &p)
 {
 	float r = cellnoise(p);
 	float g = cellnoise(make_float3(p.y, p.x, p.z));
@@ -348,17 +356,29 @@ ccl_device float3 cellnoise_color(float3 p)
 
 	return make_float3(r, g, b);
 }
+#else
+ccl_device float3 cellnoise_color(const float3 &p)
+{
+	__m128i v_yxz = quick_floor_sse(_mm_setr_ps(p.y, p.x, p.z, 0.0f));
+	__m128i v_xyy = shuffle<1, 0, 0, 3>(v_yxz);
+	__m128i v_zzx = shuffle<2, 2, 1, 3>(v_yxz);
+	__m128 rgb = bits_to_01_sse(hash_sse(v_xyy, v_yxz, v_zzx));
+
+	float3 result = *(float3*)&rgb;
+	return result;
+}
+#endif
 
 #if 0 // unused
 /* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
+ccl_device float pnoise(const float3 &p, const float3 &pperiod)
 {
 	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
 	return 0.5f*r + 0.5f;
 }
 
 /* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
+ccl_device float psnoise(const float3 &p, const float3 &pperiod)
 {
 	return perlin_periodic(p.x, p.y, p.z, pperiod);
 }
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index a6880e77054..c8f794adfb3 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -74,9 +74,14 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m
 	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& b)
+template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a)
 {
-	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a)
+{
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
 /* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */
@@ -108,6 +113,16 @@ template<size_t N> ccl_device_inline const __m128i broadcast(const __m128i& a)
 	return _mm_shuffle_epi32(a, _MM_SHUFFLE(N, N, N, N));
 }
 
+ccl_device_inline const __m128 uint32_to_float(const __m128i &in)
+{
+	__m128i a = _mm_srli_epi32(in, 16);
+	__m128i b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
+	__m128i c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
+	__m128 d = _mm_cvtepi32_ps(b);
+	__m128 e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
+	return _mm_add_ps(e, d);
+}
+
 #endif /* __KERNEL_SSE2__ */
 
 CCL_NAMESPACE_END
-- 
cgit v1.2.3
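
Note on quick_floor_sse(): _mm_cvttps_epi32 truncates toward zero, and _mm_cmplt_ps fills a lane with 0xffffffff (which is -1 as a signed 32-bit integer) wherever x < 0, so adding the mask subtracts 1 in exactly the negative lanes. A scalar model of the same trick (hypothetical name, illustration only):

	/* Scalar sketch of quick_floor_sse's mask trick. */
	static int quick_floor_scalar(float x)
	{
		int b = (int)x;                  /* truncate toward zero, like _mm_cvttps_epi32 */
		int isneg = (x < 0.0f) ? -1 : 0; /* the comparison mask, viewed as a signed int */
		return b + isneg;                /* e.g. -1.3f -> -1 + (-1) == -2 == floor(-1.3f) */
	}

This matches the scalar quick_floor() it pairs with, including the behavior at exact negative integers (both map -2.0f to -3, since the "is negative" adjustment is applied unconditionally for x < 0).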
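
Note on the sign handling in grad_sse(): the quoted hunk ends right after case_uneg_mask is built, but the visible lines show the idea: shift (h & 1) into the sign-bit position so the lane holds -0.0f or +0.0f, then (presumably via _mm_xor_ps, which lies outside the quoted context) XOR it with u to compute (h & 1) ? -u : u without a branch. A scalar model of that bit trick (hypothetical name; how the mask is applied in the real code is an assumption here):

	#include <cstdint>
	#include <cstring>

	/* Scalar sketch of the branchless sign flip. */
	static float flip_sign_if_odd(float u, int h)
	{
		uint32_t sign = (uint32_t)(h & 1) << 31; /* 0x80000000 when h is odd, else 0 */
		uint32_t bits;
		std::memcpy(&bits, &u, sizeof(bits));
		bits ^= sign;                            /* toggles only the sign bit */
		std::memcpy(&u, &bits, sizeof(u));
		return u;                                /* == (h & 1) ? -u : u */
	}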
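
Note on the new uint32_to_float() helper: SSE2's _mm_cvtepi32_ps converts signed 32-bit integers, so hash values with the top bit set would come out negative. The helper therefore splits each lane into 16-bit halves: the low half (b) converts exactly, while the high half (a) is OR-ed into the mantissa of the float 2^39 (bit pattern 0x53000000), so subtracting 2^39 again leaves exactly hi * 65536.0f; adding the two parts reproduces the unsigned value. A scalar sketch of the same arithmetic (hypothetical name, illustration only):

	#include <cstdint>
	#include <cstring>

	/* Scalar sketch of uint32_to_float's exponent trick. */
	static float uint32_to_float_scalar(uint32_t in)
	{
		uint32_t hi = in >> 16;          /* like _mm_srli_epi32(in, 16) */
		uint32_t lo = in & 0x0000ffffu;  /* like the _mm_and_si128 mask */

		/* hi | 0x53000000 is the float 2^39 + hi * 2^16 (hi fits in the
		 * 23-bit mantissa), so subtracting 2^39 leaves hi * 65536.0f exactly. */
		uint32_t c_bits = hi | 0x53000000u;
		uint32_t m_bits = 0x53000000u;
		float c, m;
		std::memcpy(&c, &c_bits, sizeof(c));
		std::memcpy(&m, &m_bits, sizeof(m));

		return (c - m) + (float)lo;      /* rounds the same way as (float)in */
	}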