Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Sv. Lockal <lockalsash@gmail.com> 2014-03-23 00:45:48 +0400
committer: Sv. Lockal <lockalsash@gmail.com> 2014-03-23 00:45:59 +0400
commit: c45c472e1b373c5125955056bcf3dd9b5edb8d18 (patch)
tree: 7fe494f21c8c62152e62f54b0bac1e11315c3b61 /intern/cycles/util/util_simd.h
parent: 0ef416722ebce6b87158429580a55cf729ebb020 (diff)
Cycles: SSE optimization for line segments/ribbons hair
Gives ~11% speedup for hair.blend, ~10% for koro_final.blend. Also extracts a few common subexpressions in the hair calculation. Reviewed By: brecht. Differential Revision: https://developer.blender.org/D318
Diffstat (limited to 'intern/cycles/util/util_simd.h')
-rw-r--r-- intern/cycles/util/util_simd.h | 52
1 files changed, 52 insertions, 0 deletions
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index fff682bb436..486816cc5c0 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -154,6 +154,12 @@ ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m12
return _mm_sub_ps(_mm_mul_ps(a, b), c);
}
+/* calculate -a*b+c (replacement for fused negated multiply-add on SSE CPUs) */
+ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c)
+{
+ return _mm_sub_ps(c, _mm_mul_ps(a, b));
+}
+
template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a)
{
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N)));
@@ -204,6 +210,52 @@ ccl_device_inline const __m128 load_m128(const float3 &vec)
}
#endif /* __KERNEL_WITH_SSE_ALIGN__ */
+ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b)
+{
+#ifdef __KERNEL_SSE41__
+ return _mm_dp_ps(a, b, 0x7f);
+#else
+ __m128 t = _mm_mul_ps(a, b);
+ return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]);
+#endif
+}
+
+ccl_device_inline float dot3(const __m128& a, const __m128& b)
+{
+#ifdef __KERNEL_SSE41__
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f));
+#else
+ __m128 t = _mm_mul_ps(a, b);
+ return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+#endif
+}
+
+ccl_device_inline const __m128 len3_squared_splat(const __m128& a)
+{
+ return dot3_splat(a, a);
+}
+
+ccl_device_inline float len3_squared(const __m128& a)
+{
+ return dot3(a, a);
+}
+
+ccl_device_inline float len3(const __m128& a)
+{
+ return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a)));
+}
+
+/* calculate cross product with its components shuffled (zxy order, as the name indicates); useful when the order of components does not matter */
+ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b)
+{
+ return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a)));
+}
+
+ccl_device_inline const __m128 cross(const __m128& a, const __m128& b)
+{
+ return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
+}
+
#endif /* __KERNEL_SSE2__ */
CCL_NAMESPACE_END