diff options
author | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
commit | d57c6748c4ebb37246caf25d4900ef6d5c16c0fe (patch) | |
tree | 08491bec3d7310f7df1e2171c8fb44a68d508a90 /intern/cycles/util/util_types.h | |
parent | 9131adca9f748f794c18c71d36f830a961c218b4 (diff) |
Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.
On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and 30%
speedup with visual studio (2008). It turns out visual studio was optimizing the
existing code quite poorly compared to pretty good autovectorization by clang/gcc,
but hand written SSE code also gives a smaller speed boost there.
This code isn't enabled yet when using the hair minimum width feature; that still
needs to be made to work with the SSE code.
Diffstat (limited to 'intern/cycles/util/util_types.h')
-rw-r--r-- | intern/cycles/util/util_types.h | 57 |
1 files changed, 56 insertions, 1 deletions
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 472a707d8fd..d4ff95b0663 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -72,13 +72,21 @@ #include <tmmintrin.h> /* SSE 3 */ #include <smmintrin.h> /* SSE 4 */ +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif + +#ifndef __KERNEL_SSE3__ #define __KERNEL_SSE3__ +#endif + +#ifndef __KERNEL_SSE4__ #define __KERNEL_SSE4__ +#endif #else -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__KERNEL_SSE3__) /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. * Since we can't avoid including <windows.h>, better only include that */ @@ -87,9 +95,16 @@ #else #include <xmmintrin.h> /* SSE 1 */ #include <emmintrin.h> /* SSE 2 */ + +#ifdef __KERNEL_SSE3__ +#include <pmmintrin.h> /* SSE 3 */ +#include <tmmintrin.h> /* SSE 3 */ +#endif #endif +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif #endif @@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f) #endif +#ifdef __KERNEL_SSE3__ + +/* SSE shuffle utility functions */ + +__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf) +{ + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& a, const __m128& b) +{ + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& b) +{ + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} +#endif + +#if defined(__KERNEL_SSE2__) && defined(_MSC_VER) + +/* count zeros from start or end of integer bits */ + +__device_inline uint32_t __builtin_ctz(uint32_t i) +{ + unsigned long r = 0; + _BitScanForward(&r, i); + return (uint32_t)r; +} + +__device_inline uint32_t __builtin_clz(uint32_t i) +{ + unsigned long r = 0; + 
_BitScanReverse(&r, i); + return (uint32_t)r; +} + +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_H__ */ |