From a24fbf3323101cd35332161b12a04e687b5583e4 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Wed, 2 Aug 2017 02:23:03 +0200
Subject: Code refactor: add, remove, optimize various SSE functions.

* Remove some unnecessary SSE emulation defines.
* Use full precision float division so we can enable it.
* Add sqrt(), sqr(), fabs(), shuffle variations, mask().
* Optimize reduce_add(), select().

Differential Revision: https://developer.blender.org/D2764
---
 intern/cycles/util/util_math.h        |  1 +
 intern/cycles/util/util_math_float3.h | 20 +++-----
 intern/cycles/util/util_math_float4.h | 93 ++++++++++++++++++++++++++++-------
 intern/cycles/util/util_simd.h        | 52 ++------------------
 4 files changed, 86 insertions(+), 80 deletions(-)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b719640b19c..4d51ec5570a 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
 #ifndef __KERNEL_GPU__
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index bb04c4aa2d9..e73e5bc17a2 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
 
 ccl_device_inline float3 operator*(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
 	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
 
 ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(a.m128);
-	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(f / a.x, f / a.y, f / a.z);
 #endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
 
 ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(b.m128);
-	return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 #endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 ccl_device_inline float3 rcp(const float3& a)
 {
 #ifdef __KERNEL_SSE__
-	const float4 r(_mm_rcp_ps(a.m128));
-	return float3(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
 #endif
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index d89121b3a1d..007b3fc5082 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -48,6 +48,8 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
 ccl_device_inline float dot(const float4& a, const float4& b);
 ccl_device_inline float len_squared(const float4& a);
 ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
 ccl_device_inline float reduce_add(const float4& a);
@@ -57,14 +59,20 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
 #endif /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
 # ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
 template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +85,6 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
-# if 0
-ccl_device_inline float4 reduce_add(const float4& a);
-# endif
 #endif /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return a * rcp(b);
+	return float4(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 r(_mm_rcp_ps(a.m128));
-	return float4(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sqrt_ps(a.m128));
+#else
+	return make_float4(sqrtf(a.x),
+	                   sqrtf(a.y),
+	                   sqrtf(a.z),
+	                   sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+	return a * a;
+}
+
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
@@ -257,9 +278,13 @@ ccl_device_inline bool is_zero(const float4& a)
 ccl_device_inline float reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE3__
+	float4 h(_mm_hadd_ps(a.m128, a.m128));
+	return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+# else
 	float4 h(shuffle<1,0,3,2>(a) + a);
-	/* TODO(sergey): Investigate efficiency. */
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+# endif
 #else
 	return ((a.x + a.y) + (a.z + a.w));
 #endif
@@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 	                   max(a.w, b.w));
 #endif
 }
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+	return make_float4(fabsf(a.x),
+	                   fabsf(a.y),
+	                   fabsf(a.z),
+	                   fabsf(a.w));
+#endif
+}
 #endif /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
@@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b)
 	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
 }
 
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+	return float4(_mm_shuffle_ps(a.m128, b.m128,
+	                             _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
 	return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
 }
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+	return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+	return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
 # ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
@@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
-	                        _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+	return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #else
 	return make_float4((mask.x)? a.x: b.x,
 	                   (mask.y)? a.y: b.y,
@@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask,
 #endif
 }
 
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+	/* Replace elements of x with zero where mask isn't set. */
+	return select(mask, a, make_float4(0.0f));
+}
+
 ccl_device_inline float4 reduce_min(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
 #endif
 }
 
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
 {
 #ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return shuffle<2,3,0,1>(h) + h;
+	return float4(_mm_loadu_ps(v));
 #else
-	return make_float4((a.x + a.y) + (a.z + a.w));
+	return make_float4(v[0], v[1], v[2], v[3]);
 #endif
 }
-#endif
+
 #endif /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 7d938a0fbca..66dd80420ae 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -347,12 +347,9 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+#if !(defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-# ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT 0x00
 #define _MM_FROUND_TO_NEG_INF 0x01
@@ -361,48 +358,31 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_CUR_DIRECTION 0x04
 
 #undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
 __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) {
   return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
 }
 
 #undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
 __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) {
   assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
 }
 
 #undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) {
   return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
 }
 
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i));
-  return rvalue;
-}
-
 #undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) {
   return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
 }
 
 #undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) {
   return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
 }
 
 #undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
 __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
   switch ( index ) {
   case 0: return _mm_cvtsi128_si32(input);
@@ -414,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
 }
 
 #undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) {
   assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
 }
 
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
 #undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
 __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
 { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
 
 #undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
 __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 {
   switch ( flags )
@@ -444,22 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-# ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
-  assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value;
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
-  assert(size_t(index) < 2);
-  return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
-}
-# endif
-
-# endif
+#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #undef _mm_fabs_ps
 #define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
@@ -494,8 +450,6 @@ ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
 #undef _mm_mask_ps
 #define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
 
-#endif
-
 #else  /* __KERNEL_SSE2__ */
 
 /* This section is for utility functions which operates on non-register data
-- 
cgit v1.2.3
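
For illustration only, and not part of the commit: a minimal standalone sketch of the precision difference behind the "use full precision float division" point in the log. The approximate-reciprocal path (_mm_rcp_ps plus one Newton-Raphson step) mirrors what the old rcp()/operator/ code did, and the _mm_div_ps path is what the patch switches to; the file name and test values are made up for the example. Build with something like: g++ -msse2 -O2 rcp_precision.cc && ./a.out

    /* rcp_precision.cc -- compare approximate reciprocal vs. full division. */
    #include <cstdio>
    #include <xmmintrin.h>  /* SSE: _mm_rcp_ps, _mm_div_ps, ... */

    int main()
    {
        const __m128 a = _mm_set_ps(7.0f, 3.0f, 1.5f, 0.1f);

        /* Old style: ~12-bit reciprocal estimate, refined once with
         * Newton-Raphson: r' = 2*r - a*r*r. */
        __m128 r = _mm_rcp_ps(a);
        __m128 approx = _mm_sub_ps(_mm_add_ps(r, r),
                                   _mm_mul_ps(_mm_mul_ps(r, r), a));

        /* New style: full precision IEEE division, as in the patched rcp(). */
        __m128 exact = _mm_div_ps(_mm_set1_ps(1.0f), a);

        float fa[4], fapprox[4], fexact[4];
        _mm_storeu_ps(fa, a);
        _mm_storeu_ps(fapprox, approx);
        _mm_storeu_ps(fexact, exact);

        for(int i = 0; i < 4; i++) {
            printf("1/%-4g  approx=%.9g  div=%.9g  diff=%g\n",
                   fa[i], fapprox[i], fexact[i], fapprox[i] - fexact[i]);
        }
        return 0;
    }

The remaining difference per lane is what the old TODO comments called "precision issues"; paying the cost of a real divide is what lets the SSE paths be enabled again.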
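Also for illustration only: a sketch of the two optimizations named in the log, reduce_add() via SSE3 horizontal adds and select() via SSE4.1 _mm_blendv_ps instead of and/andnot on a converted mask. The file and helper names here are invented for the example; only the intrinsic sequences mirror the patched code. Build with something like: g++ -msse4.1 -O2 reduce_select.cc && ./a.out

    /* reduce_select.cc -- horizontal add and per-lane select, as optimized above. */
    #include <cstdio>
    #include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */
    #include <smmintrin.h>  /* SSE4.1: _mm_blendv_ps */

    /* Horizontal sum of the four lanes, as in the SSE3 path of reduce_add(). */
    static inline float reduce_add_sse3(__m128 a)
    {
        __m128 h = _mm_hadd_ps(a, a);            /* (x+y, z+w, x+y, z+w) */
        return _mm_cvtss_f32(_mm_hadd_ps(h, h)); /* x+y+z+w in the low lane */
    }

    /* Per-lane select: take a where the mask sign bit is set, else b,
     * matching the SSE4.1 path of select(). */
    static inline __m128 select_sse41(__m128i mask, __m128 a, __m128 b)
    {
        return _mm_blendv_ps(b, a, _mm_castsi128_ps(mask));
    }

    int main()
    {
        __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
        __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
        __m128i mask = _mm_set_epi32(0, -1, 0, -1);  /* pick a in lanes 0 and 2 */

        printf("reduce_add = %g\n", reduce_add_sse3(a));  /* prints 10 */

        float out[4];
        _mm_storeu_ps(out, select_sse41(mask, a, b));
        printf("select     = %g %g %g %g\n",
               out[0], out[1], out[2], out[3]);  /* prints 1 20 3 40 */
        return 0;
    }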