diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-03-23 00:45:48 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-03-23 00:45:59 +0400 |
commit | c45c472e1b373c5125955056bcf3dd9b5edb8d18 (patch) | |
tree | 7fe494f21c8c62152e62f54b0bac1e11315c3b61 /intern/cycles/util/util_simd.h | |
parent | 0ef416722ebce6b87158429580a55cf729ebb020 (diff) |
Cycles: SSE optimization for line segments/ribbons hair
Gives ~11% speedup for hair.blend, ~10% for koro_final.blend
Also extract a few common subexpressions in hair calculation.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D318
Diffstat (limited to 'intern/cycles/util/util_simd.h')
-rw-r--r-- | intern/cycles/util/util_simd.h | 52 |
1 file changed, 52 insertions, 0 deletions
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index fff682bb436..486816cc5c0 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -154,6 +154,12 @@ ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m12 return _mm_sub_ps(_mm_mul_ps(a, b), c); } +/* calculate -a*b+c (replacement for fused negated-multiply-subtract on SSE CPUs) */ +ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c) +{ + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +} + template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N))); @@ -204,6 +210,52 @@ ccl_device_inline const __m128 load_m128(const float3 &vec) } #endif /* __KERNEL_WITH_SSE_ALIGN__ */ +ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b) +{ +#ifdef __KERNEL_SSE41__ + return _mm_dp_ps(a, b, 0x7f); +#else + __m128 t = _mm_mul_ps(a, b); + return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]); +#endif +} + +ccl_device_inline float dot3(const __m128& a, const __m128& b) +{ +#ifdef __KERNEL_SSE41__ + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f)); +#else + __m128 t = _mm_mul_ps(a, b); + return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]; +#endif +} + +ccl_device_inline const __m128 len3_squared_splat(const __m128& a) +{ + return dot3_splat(a, a); +} + +ccl_device_inline float len3_squared(const __m128& a) +{ + return dot3(a, a); +} + +ccl_device_inline float len3(const __m128& a) +{ + return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a))); +} + +/* calculate shuffled cross product, useful when order of components does not matter */ +ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b) +{ + return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a))); +} + +ccl_device_inline const __m128 cross(const __m128& a, const __m128& b) +{ + return 
shuffle<1, 2, 0, 3>(cross_zxy(a, b)); +} + #endif /* __KERNEL_SSE2__ */ CCL_NAMESPACE_END |