diff options
author | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
commit | d57c6748c4ebb37246caf25d4900ef6d5c16c0fe (patch) | |
tree | 08491bec3d7310f7df1e2171c8fb44a68d508a90 /intern/cycles/util/util_types.h | |
parent | 9131adca9f748f794c18c71d36f830a961c218b4 (diff) |
Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.
On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and 30%
speedup with visual studio (2008). It turns out visual studio was optimizing the
existing code quite poorly compared to pretty good autovectorization by clang/gcc,
but hand written SSE code also gives a smaller speed boost there.
This code isn't enabled yet when using the hair minimum width feature; that still
needs to be made to work with the SSE code.
Diffstat (limited to 'intern/cycles/util/util_types.h')
-rw-r--r-- | intern/cycles/util/util_types.h | 57 |
1 files changed, 56 insertions, 1 deletions
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 472a707d8fd..d4ff95b0663 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -72,13 +72,21 @@ #include <tmmintrin.h> /* SSE 3 */ #include <smmintrin.h> /* SSE 4 */ +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif + +#ifndef __KERNEL_SSE3__ #define __KERNEL_SSE3__ +#endif + +#ifndef __KERNEL_SSE4__ #define __KERNEL_SSE4__ +#endif #else -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__KERNEL_SSE3__) /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. * Since we can't avoid including <windows.h>, better only include that */ @@ -87,9 +95,16 @@ #else #include <xmmintrin.h> /* SSE 1 */ #include <emmintrin.h> /* SSE 2 */ + +#ifdef __KERNEL_SSE3__ +#include <pmmintrin.h> /* SSE 3 */ +#include <tmmintrin.h> /* SSE 3 */ +#endif #endif +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif #endif @@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f) #endif +#ifdef __KERNEL_SSE3__ + +/* SSE shuffle utility functions */ + +__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf) +{ + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& a, const __m128& b) +{ + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& b) +{ + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} +#endif + +#if defined(__KERNEL_SSE2__) && defined(_MSC_VER) + +/* count zeros from start or end of integer bits */ + +__device_inline uint32_t __builtin_ctz(uint32_t i) +{ + unsigned long r = 0; + _BitScanForward(&r, i); + return (uint32_t)r; +} + +__device_inline uint32_t __builtin_clz(uint32_t i) +{ + unsigned long r = 0; + 
_BitScanReverse(&r, i); + return (uint32_t)r; +} + +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_H__ */ |