Code refactor: use float4 instead of intrinsics for CPU denoise filtering.

Differential Revision: https://developer.blender.org/D2764
author: Brecht Van Lommel <brechtvanlommel@gmail.com> 2017-07-19 02:54:56 +0300
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2017-08-07 15:01:24 +0300
commit: ee77c1e917dd5e9d8eaca6212376a11aef1a877d (patch)
tree: 913bb920bd8ac48c392f63b2cebd0801430e3218 /intern/cycles/util
parent: a24fbf3323101cd35332161b12a04e687b5583e4 (diff)
3 files changed, 20 insertions, 53 deletions
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 007b3fc5082..adb9a76a434 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -52,7 +52,6 @@ ccl_device_inline float4 sqrt(const float4& a);
 ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
 ccl_device_inline float average(const float4& a);
 ccl_device_inline float len(const float4& a);
 ccl_device_inline float4 normalize(const float4& a);
@@ -85,6 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
+ccl_device_inline float4 reduce_add(const float4& a);
 #endif  /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -275,24 +275,24 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
 #  ifdef __KERNEL_SSE3__
     float4 h(_mm_hadd_ps(a.m128, a.m128));
-    return  _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+    return float4( _mm_hadd_ps(h.m128, h.m128));
 #  else
 	float4 h(shuffle<1,0,3,2>(a) + a);
-	return  _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+	return  shuffle<2,3,0,1>(h) + h;
 #  endif
 #else
-	return ((a.x + a.y) + (a.z + a.w));
+	return make_float4(((a.x + a.y) + (a.z + a.w)));
 #endif
 }
 
 ccl_device_inline float average(const float4& a)
 {
-	return reduce_add(a) * 0.25f;
+	return reduce_add(a)[0] * 0.25f;
 }
 
 ccl_device_inline float len(const float4& a)
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index c7511f8306e..7269d391956 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
 	for(int i = 0; i < n; i++) {
-		A[i] = _mm_setzero_ps();
+		A[i] = make_float4(0.0f);
 	}
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_setzero_ps();
+			MAT(A, n, row, col) = make_float4(0.0f);
 		}
 	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+			MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
 		}
 	}
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_add_ps(V[i], a[i]);
+		V[i] += a[i];
 	}
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_mul_ps(V[i], a[i]);
+		V[i] *= a[i];
 	}
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
 	for(int i = 0; i < n; i++) {
-		a[i] = _mm_max_ps(a[i], b[i]);
+		a[i] = max(a[i], b[i]);
 	}
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+			MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
 		}
 	}
 }
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 66dd80420ae..a2b3247b207 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -417,39 +417,6 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 
 #endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
-  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
-  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
-  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
-  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
-  return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
-    __m128 a = _mm_movehdup_ps(x);
-    __m128 b = _mm_add_ps(x, a);
-    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
-    x = _mm_hadd_ps(x, x);
-    x = _mm_hadd_ps(x, x);
-    return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
 #else  /* __KERNEL_SSE2__ */
 
 /* This section is for utility functions which operates on non-register data
author	Brecht Van Lommel <brechtvanlommel@gmail.com>	2017-07-19 02:54:56 +0300
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2017-08-07 15:01:24 +0300
commit	ee77c1e917dd5e9d8eaca6212376a11aef1a877d (patch)
tree	913bb920bd8ac48c392f63b2cebd0801430e3218 /intern/cycles/util
parent	a24fbf3323101cd35332161b12a04e687b5583e4 (diff)