diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-03-23 00:45:48 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-03-23 00:45:59 +0400 |
commit | c45c472e1b373c5125955056bcf3dd9b5edb8d18 (patch) | |
tree | 7fe494f21c8c62152e62f54b0bac1e11315c3b61 /intern/cycles/util/util_simd.h | |
parent | 0ef416722ebce6b87158429580a55cf729ebb020 (diff) |
Cycles: SSE optimization for line segments/ribbons hair
Gives ~11% speedup for hair.blend, ~10% for koro_final.blend
Also extract a few common subexpressions in hair calculation.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D318
Diffstat (limited to 'intern/cycles/util/util_simd.h')
-rw-r--r-- | intern/cycles/util/util_simd.h | 52 |
1 file changed, 52 insertions, 0 deletions
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index fff682bb436..486816cc5c0 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -154,6 +154,12 @@ ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m12 return _mm_sub_ps(_mm_mul_ps(a, b), c); } +/* calculate -a*b+c (replacement for fused negated-multiply-subtract on SSE CPUs) */ +ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c) +{ + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +} + template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N))); @@ -204,6 +210,52 @@ ccl_device_inline const __m128 load_m128(const float3 &vec) } #endif /* __KERNEL_WITH_SSE_ALIGN__ */ +ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b) +{ +#ifdef __KERNEL_SSE41__ + return _mm_dp_ps(a, b, 0x7f); +#else + __m128 t = _mm_mul_ps(a, b); + return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]); +#endif +} + +ccl_device_inline float dot3(const __m128& a, const __m128& b) +{ +#ifdef __KERNEL_SSE41__ + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f)); +#else + __m128 t = _mm_mul_ps(a, b); + return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]; +#endif +} + +ccl_device_inline const __m128 len3_squared_splat(const __m128& a) +{ + return dot3_splat(a, a); +} + +ccl_device_inline float len3_squared(const __m128& a) +{ + return dot3(a, a); +} + +ccl_device_inline float len3(const __m128& a) +{ + return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a))); +} + +/* calculate shuffled cross product, useful when order of components does not matter */ +ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b) +{ + return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a))); +} + +ccl_device_inline const __m128 cross(const __m128& a, const __m128& b) +{ + return 
shuffle<1, 2, 0, 3>(cross_zxy(a, b)); +} + #endif /* __KERNEL_SSE2__ */ CCL_NAMESPACE_END |