Cycles: test code for SSE 4.1 kernel and alignment for some vector types.

This is mostly work towards enabling the __KERNEL_SSE__ option to start using SIMD operations for vector math operations. This SSE 4.1 kernel performs about 8% faster with that option, but overall it is still slower than without the option. WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 is the CMake flag for testing this kernel. Aligning int3, int4, float3 and float4 to 16 bytes seems to give a slight 1-2% speedup on tested systems with the current kernel already, so it is enabled now.
author: Martijn Berger <martijn.berger@gmail.com> 2013-11-22 17:16:47 +0400
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2013-11-22 17:42:41 +0400
commit: e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch)
tree: 77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/util/util_math.h
parent: 5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff)
1 files changed, 13 insertions, 0 deletions
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 6db532faf74..851c67b1189 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -462,7 +462,11 @@ ccl_device_inline float3 operator/=(float3& a, float f)
 
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
+#else	
 	return a.x*b.x + a.y*b.y + a.z*b.z;
+#endif
 }
 
 ccl_device_inline float3 cross(const float3 a, const float3 b)
@@ -475,7 +479,11 @@ ccl_device_inline float3 cross(const float3 a, const float3 b)
 
 ccl_device_inline float len(const float3 a)
 {
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
 	return sqrtf(dot(a, a));
+#endif
 }
 
 ccl_device_inline float len_squared(const float3 a)
@@ -487,7 +495,12 @@ ccl_device_inline float len_squared(const float3 a)
 
 ccl_device_inline float3 normalize(const float3 a)
 {
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
+	return _mm_div_ps(a.m128, norm);
+#else
 	return a/len(a);
+#endif
 }
 
 #endif
author	Martijn Berger <martijn.berger@gmail.com>	2013-11-22 17:16:47 +0400
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2013-11-22 17:42:41 +0400
commit	e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch)
tree	77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/util/util_math.h
parent	5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff)