diff options
author | Martijn Berger <martijn.berger@gmail.com> | 2013-11-22 17:16:47 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2013-11-22 17:42:41 +0400 |
commit | e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch) | |
tree | 77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/util/util_math.h | |
parent | 5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff) |
Cycles: test code for sse 4.1 kernel and alignment for some vector types.
This is mostly work towards enabling the __KERNEL_SSE__ option to start using
SIMD operations for vector math operations. This 4.1 kernel performes about 8%
faster with that option but overall is still slower than without the option.
WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 is the cmake flag for testing this kernel.
Alignment of int3, int4, float3, float4 to 16 bytes seems to give a slight 1-2%
speedup on tested systems with the current kernel already, so is enabled now.
Diffstat (limited to 'intern/cycles/util/util_math.h')
-rw-r--r-- | intern/cycles/util/util_math.h | 13 |
1 files changed, 13 insertions, 0 deletions
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 6db532faf74..851c67b1189 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -462,7 +462,11 @@ ccl_device_inline float3 operator/=(float3& a, float f) ccl_device_inline float dot(const float3 a, const float3 b) { +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); +#else return a.x*b.x + a.y*b.y + a.z*b.z; +#endif } ccl_device_inline float3 cross(const float3 a, const float3 b) @@ -475,7 +479,11 @@ ccl_device_inline float3 cross(const float3 a, const float3 b) ccl_device_inline float len(const float3 a) { +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); +#else return sqrtf(dot(a, a)); +#endif } ccl_device_inline float len_squared(const float3 a) @@ -487,7 +495,12 @@ ccl_device_inline float len_squared(const float3 a) ccl_device_inline float3 normalize(const float3 a) { +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); + return _mm_div_ps(a.m128, norm); +#else return a/len(a); +#endif } #endif |