Cycles: fewer instructions for CPU perlin code

Also restore shuffle() in BVH traversal (it was accidentally lost in the SSE hair revert)
author: Sv. Lockal <lockalsash@gmail.com> 2014-02-01 14:07:53 +0400
committer: Sv. Lockal <lockalsash@gmail.com> 2014-02-01 14:07:53 +0400
commit: 1ff299b3179788cb82e1a660796101f7b57ceefc (patch)
tree: a53c090511d324f4e53a1e1233014f477fb72ded /intern
parent: 62a3fe23234b931d01a53871ef6a1265fd0a9235 (diff)
3 files changed, 27 insertions, 12 deletions
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index 1e898abc464..1ee1fbc3cb4 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -168,7 +168,7 @@ ccl_device bool BVH_FUNCTION_NAME
 				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
 
 				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle_swap(tminmax, shuf_swap));
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index fb749ab778d..282ad191470 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -260,22 +260,22 @@ ccl_device_noinline float perlin(float x, float y, float z)
 	__m128 uvw = fade_sse(&fxyz);
 	__m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
 
-	__m128i ci[] = {_mm_setr_epi32(1, 1, 1, 1), _mm_setr_epi32(0, 0, 1, 1), _mm_setr_epi32(0, 1, 0, 1)};
-	__m128i vp[] = {broadcast<0>(XYZ), broadcast<1>(XYZ), broadcast<2>(XYZ)};
-	__m128i vd[] = {_mm_add_epi32(vp[0], ci[0]), _mm_add_epi32(vp[1], ci[1]), _mm_add_epi32(vp[2], ci[2])};
+	__m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1));
+	__m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc);                      // +0, +0, +1, +1
+	__m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
 
-	__m128i h1 = hash_sse(vp[0], vd[1], vd[2]);    // hash directions 000, 001, 010, 011 (vp[0] is not a typo, because vp[0]+0 == vp[0])
-	__m128i h2 = hash_sse(vd[0], vd[1], vd[2]);    // hash directions 100, 101, 110, 111
+	__m128i h1 = hash_sse(broadcast<0>(XYZ),     vdy, vdz);               // hash directions 000, 001, 010, 011
+	__m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz);               // hash directions 100, 101, 110, 111
 
-	__m128 cf[] = {_mm_setr_ps(1.0f, 1.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 0.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 1.0f, 0.0f, 1.0f)};
-	__m128 vf[] = {broadcast<0>(fxyz), broadcast<1>(fxyz),  broadcast<2>(fxyz)};
-	__m128 vfd[] = {_mm_sub_ps(vf[0], cf[0]), _mm_sub_ps(vf[1], cf[1]), _mm_sub_ps(vf[2], cf[2])};
+	__m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f));
+	__m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
+	__m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
 
-	__m128 g1 = grad_sse(h1, vf[0],  vfd[1], vfd[2]); // vf is not a typo (same as above)
-	__m128 g2 = grad_sse(h2, vfd[0], vfd[1], vfd[2]);
+	__m128 g1 = grad_sse(h1, broadcast<0>(fxyz),     vfy, vfz);
+	__m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz);
 	__m128 n1 = nerp_sse(u, g1, g2);
 
-	__m128 n1_half = _mm_movehl_ps(n1, n1);        // extract 2 floats to a separate vector
+	__m128 n1_half = shuffle<2, 3, 2, 3>(n1);      // extract 2 floats to a separate vector
 	__m128 n2 = nerp_sse(v, n1, n1_half);          // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
 
 	__m128 n2_second = broadcast<1>(n2);           // extract b to a separate vector
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index c8f794adfb3..ac4e38ec1b8 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -79,11 +79,26 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m
 	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
 }
 
+template<> __forceinline const __m128 shuffle<0, 1, 0, 1>(const __m128& a)
+{
+	return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const __m128 shuffle<2, 3, 2, 3>(const __m128& a)
+{
+	return _mm_movehl_ps(a, a);
+}
+
 template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a)
 {
 	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
+template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a, const __m128i& b)
+{
+	return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
 /* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */
 #ifdef __KERNEL_SSE41__
 ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b)
author	Sv. Lockal <lockalsash@gmail.com>	2014-02-01 14:07:53 +0400
committer	Sv. Lockal <lockalsash@gmail.com>	2014-02-01 14:07:53 +0400
commit	1ff299b3179788cb82e1a660796101f7b57ceefc (patch)
tree	a53c090511d324f4e53a1e1233014f477fb72ded /intern
parent	62a3fe23234b931d01a53871ef6a1265fd0a9235 (diff)