Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@gmail.com>2017-07-19 02:54:56 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2017-08-07 15:01:24 +0300
commitee77c1e917dd5e9d8eaca6212376a11aef1a877d (patch)
tree913bb920bd8ac48c392f63b2cebd0801430e3218 /intern/cycles/kernel/filter
parenta24fbf3323101cd35332161b12a04e687b5583e4 (diff)
Code refactor: use float4 instead of intrinsics for CPU denoise filtering.
Differential Revision: https://developer.blender.org/D2764
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h80
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h18
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h16
3 files changed, 48 insertions, 66 deletions
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 27e220923a0..3ddd8712266 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- __m128 y4 = _mm_set1_ps(pixel.y); \
+ float4 y4 = make_float4(pixel.y); \
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
- __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+ float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+ int4 active_pixels = x4 < make_float4(high.x);
#define END_FOR_PIXEL_WINDOW_SSE } \
pixel_buffer += buffer_w - (pixel.x - low.x); \
}
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
- __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ int4 active_pixels,
const float *ccl_restrict buffer,
- __m128 *features,
- const __m128 *ccl_restrict mean,
+ float4 *features,
+ const float4 *ccl_restrict mean,
int pass_stride)
{
features[0] = x;
features[1] = y;
- features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+ features[2] = fabs(ccl_get_feature_sse(0));
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
features[9] = ccl_get_feature_sse(7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
- features[i] = _mm_sub_ps(features[i], mean[i]);
+ features[i] = features[i] - mean[i];
}
for(int i = 0; i < DENOISE_FEATURES; i++)
- features[i] = _mm_mask_ps(features[i], active_pixels);
+ features[i] = mask(active_pixels, features[i]);
}
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
- __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ int4 active_pixels,
const float *ccl_restrict buffer,
- __m128 *scales,
- const __m128 *ccl_restrict mean,
+ float4 *scales,
+ const float4 *ccl_restrict mean,
int pass_stride)
{
- scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
- scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
- scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
- __m128 diff, scale;
- diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
- scale = _mm_mul_ps(diff, diff);
- diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- scales[3] = _mm_mask_ps(scale, active_pixels);
-
- scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
- diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
- scale = _mm_mul_ps(diff, diff);
- diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- scales[5] = _mm_mask_ps(scale, active_pixels);
+ scales[0] = fabs(x - mean[0]);
+ scales[1] = fabs(y - mean[1]);
+ scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+ scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+ sqr(ccl_get_feature_sse(2) - mean[4]) +
+ sqr(ccl_get_feature_sse(3) - mean[5]);
+ scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+ scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+ sqr(ccl_get_feature_sse(6) - mean[8]) +
+ sqr(ccl_get_feature_sse(7) - mean[9]);
+ for(int i = 0; i < 6; i++)
+ scales[i] = mask(active_pixels, scales[i]);
}
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
{
- scale[0] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
- scale[1] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
- scale[2] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
- scale[6] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
- scale[7] = scale[8] = scale[9] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
- scale[3] = scale[4] = scale[5] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+ scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+ scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+ scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+ scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+ scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+ scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
}
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 3e752bce68f..5e989331bc2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
int w,
int f)
{
-#ifdef __KERNEL_SSE3__
- int aligned_lowx = (rect.x & ~(3));
- int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+ int aligned_lowx = rect.x / 4;
+ int aligned_highx = (rect.z + 3) / 4;
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
out_image[y*w+x] = 0.0f;
}
for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
- for(int x = aligned_lowx; x < aligned_highx; x+=4) {
- _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+ float4* out_image4 = (float4*)(out_image + y*w);
+ float4* difference_image4 = (float4*)(difference_image + y1*w);
+ for(int x = aligned_lowx; x < aligned_highx; x++) {
+ out_image4[x] += difference_image4[x];
}
-#else
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*w+x] += difference_image[y1*w+x];
- }
-#endif
}
for(int x = rect.x; x < rect.z; x++) {
out_image[y*w+x] *= 1.0f/(high - low);
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 30dc2969b11..9e65f61664b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
{
int buffer_w = align_up(rect.z - rect.x, 4);
- __m128 features[DENOISE_FEATURES];
+ float4 features[DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
int2 pixel;
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
min(rect.w, y + radius + 1));
int num_pixels = (high.y - low.y) * (high.x - low.x);
- __m128 feature_means[DENOISE_FEATURES];
+ float4 feature_means[DENOISE_FEATURES];
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
} END_FOR_PIXEL_WINDOW_SSE
- __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+ float4 pixel_scale = make_float4(1.0f / num_pixels);
for(int i = 0; i < DENOISE_FEATURES; i++) {
- feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
}
- __m128 feature_scale[DENOISE_FEATURES];
+ float4 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
filter_calculate_scale_sse(feature_scale);
- __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
- math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
}
}