diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-07-19 02:54:56 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-08-07 15:01:24 +0300 |
commit | ee77c1e917dd5e9d8eaca6212376a11aef1a877d (patch) | |
tree | 913bb920bd8ac48c392f63b2cebd0801430e3218 | |
parent | a24fbf3323101cd35332161b12a04e687b5583e4 (diff) |
Code refactor: use float4 instead of intrinsics for CPU denoise filtering.
Differential Revision: https://developer.blender.org/D2764
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 80 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_sse.h | 16 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_sse2.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_sse3.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_sse41.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float4.h | 12 | ||||
-rw-r--r-- | intern/cycles/util/util_math_matrix.h | 28 | ||||
-rw-r--r-- | intern/cycles/util/util_simd.h | 33 |
9 files changed, 71 insertions, 119 deletions
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 27e220923a0..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. @@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 3e752bce68f..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp index f7c9935f1d0..a13fb5cd4fb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp index 070b95a3505..6b690adf0f5 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index 007b3fc5082..adb9a76a434 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -52,7 +52,6 @@ ccl_device_inline float4 sqrt(const float4& a); ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); -ccl_device_inline float reduce_add(const float4& a); ccl_device_inline float average(const float4& a); ccl_device_inline float len(const float4& a); ccl_device_inline float4 normalize(const float4& a); @@ -85,6 +84,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); +ccl_device_inline float4 reduce_add(const float4& a); #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -275,24 +275,24 @@ ccl_device_inline bool is_zero(const float4& a) #endif } -ccl_device_inline float reduce_add(const float4& a) +ccl_device_inline float4 reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ # ifdef __KERNEL_SSE3__ float4 h(_mm_hadd_ps(a.m128, a.m128)); - return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); # else float4 h(shuffle<1,0,3,2>(a) + a); - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return shuffle<2,3,0,1>(h) + h; # endif #else - return ((a.x + a.y) + (a.z + a.w)); + return make_float4(((a.x + a.y) + (a.z + a.w))); #endif } ccl_device_inline float average(const float4& a) { - return reduce_add(a) * 0.25f; + return reduce_add(a)[0] * 0.25f; } ccl_device_inline float len(const float4& a) diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index c7511f8306e..7269d391956 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ -ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) { for(int i = 0; i < n; i++) { - A[i] = _mm_setzero_ps(); + A[i] = make_float4(0.0f); } } -ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_setzero_ps(); + MAT(A, n, row, col) = make_float4(0.0f); } } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; } } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_add_ps(V[i], a[i]); + V[i] += a[i]; } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_mul_ps(V[i], a[i]); + V[i] *= a[i]; } } -ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) { - a[i] = _mm_max_ps(a[i], b[i]); + a[i] = max(a[i], b[i]); } } -ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; } } } diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 66dd80420ae..a2b3247b207 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -417,39 +417,6 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) #endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */ -#undef _mm_fabs_ps -#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) - -/* Return a __m128 with every element set to the largest element of v. */ -ccl_device_inline __m128 _mm_hmax_ps(__m128 v) -{ - /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ - v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); - /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ - v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); - return v; -} - -/* Return the sum of the four elements of x. */ -ccl_device_inline float _mm_hsum_ss(__m128 x) -{ - __m128 a = _mm_movehdup_ps(x); - __m128 b = _mm_add_ps(x, a); - return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); -} - -/* Return a __m128 with every element set to the sum of the four elements of x. */ -ccl_device_inline __m128 _mm_hsum_ps(__m128 x) -{ - x = _mm_hadd_ps(x, x); - x = _mm_hadd_ps(x, x); - return x; -} - -/* Replace elements of x with zero where mask isn't set. */ -#undef _mm_mask_ps -#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) - #else /* __KERNEL_SSE2__ */ /* This section is for utility functions which operates on non-register data |