From ee77c1e917dd5e9d8eaca6212376a11aef1a877d Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 19 Jul 2017 01:54:56 +0200
Subject: Code refactor: use float4 instead of intrinsics for CPU denoise
 filtering.

Differential Revision: https://developer.blender.org/D2764
---
 intern/cycles/util/util_math_matrix.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'intern/cycles/util/util_math_matrix.h')

diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index c7511f8306e..7269d391956 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
 	for(int i = 0; i < n; i++) {
-		A[i] = _mm_setzero_ps();
+		A[i] = make_float4(0.0f);
 	}
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_setzero_ps();
+			MAT(A, n, row, col) = make_float4(0.0f);
 		}
 	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+			MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
 		}
 	}
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_add_ps(V[i], a[i]);
+		V[i] += a[i];
 	}
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_mul_ps(V[i], a[i]);
+		V[i] *= a[i];
 	}
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
 	for(int i = 0; i < n; i++) {
-		a[i] = _mm_max_ps(a[i], b[i]);
+		a[i] = max(a[i], b[i]);
 	}
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+			MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
 		}
 	}
 }
-- 
cgit v1.2.3