diff options
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r-- | intern/cycles/kernel/filter/filter_features.h | 12 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 80 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 23 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 5 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_prefilter.h | 94 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_reconstruction.h | 56 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_sse.h | 16 |
7 files changed, 138 insertions, 148 deletions
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 53d703de143..6226ed2c2ef 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -78,16 +78,10 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); } -ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer, - int pass_stride) +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) { - return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); -} - -ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer, - int pass_stride) -{ - return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); } ccl_device_inline void design_row_add(float *design_row, diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3185330994c..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. @@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 5cb4038bc33..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); @@ -101,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d for(int x = rect.x; x < rect.z; x++) { const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); - out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); } } } @@ -133,8 +127,6 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, - float *color_pass, - float *variance_pass, float *transform, int *rank, float *XtWX, @@ -167,7 +159,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, dx, dy, w, h, pass_stride, buffer, - color_pass, variance_pass, l_transform, l_rank, weight, l_XtWX, l_XtWY, 0); } diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 078c5f56763..2c5ac807051 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -66,7 +66,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, sum += difference_image[y*w+x1]; } sum *= 1.0f/(high-low); - out_image[y*w+x] = expf(-max(sum, 0.0f)); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, @@ -97,8 +97,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, @@ -130,7 +128,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, dx, dy, w, h, pass_stride, buffer, - color_pass, variance_pass, transform, rank, weight, XtWX, XtWY, localIdx); diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 82cc36625ec..2aeb54a62be 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample, varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); } - varA /= (odd_sample - 1); - varB /= (even_sample - 1); + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); sampleVariance[idx] = 0.5f*(varA + varB) / sample; sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); @@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample, int idx = (y-rect.y)*buffer_w + (x - rect.x); mean[idx] = center_buffer[m_offset] / sample; - if(use_split_variance) { - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + if(sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } } else { - variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; } } @@ -114,41 +120,57 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, { int buffer_w = align_up(rect.z - rect.x, 4); - int n = 0; - float values[25]; - for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { - for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { - int idx = (y1-rect.y)*buffer_w + (x1-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); - - /* Find the position of L. */ - int i; - for(i = 0; i < n; i++) { - if(values[i] > L) break; - } - /* Make space for L by shifting all following values to the right. */ - for(int j = n; j > i; j--) { - values[j] = values[j-1]; - } - /* Insert L. */ - values[i] = L; - n++; - } - } - int idx = (y-rect.y)*buffer_w + (x-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); - float ref = 2.0f*values[(int)(n*0.75f)]; float fac = 1.0f; - if(L > ref) { - /* If the pixel is an outlier, negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) { depth[idx] = -depth[idx]; - fac = ref/L; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; + fac = 0.0f; + } + else { + float L = average(color); + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + float ref = 2.0f*values[(int)(n*0.75f)]; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } } out[idx ] = fac*image[idx]; out[idx + pass_stride] = fac*image[idx + pass_stride]; diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 4a4c81b7ba3..25a3025056c 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -22,8 +22,6 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int w, int h, int pass_stride, const ccl_global float *ccl_restrict buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, const ccl_global float *ccl_restrict transform, ccl_global int *rank, float weight, @@ -31,38 +29,31 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, ccl_global float3 *XtWY, int localIdx) { + if(weight < 1e-3f) { + return; + } + int p_offset = y *w + x; int q_offset = (y+dy)*w + (x+dx); -#ifdef __KERNEL_CPU__ - const int stride = 1; - (void)storage_stride; - (void)localIdx; - float design_row[DENOISE_FEATURES+1]; -#elif defined(__KERNEL_CUDA__) +#ifdef __KERNEL_GPU__ const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); #else - const int stride = storage_stride; float design_row[DENOISE_FEATURES+1]; #endif - float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); - float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); - float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); - - /* If the pixel was flagged as an outlier during prefiltering, skip it. - * Otherwise, perform the regular confidence interval test unless - * the center pixel is an outlier (in that case, using the confidence - * interval test could result in no pixels being used at all). */ - bool p_outlier = (ccl_get_feature(buffer + p_offset, 0) < 0.0f); - bool q_outlier = (ccl_get_feature(buffer + q_offset, 0) < 0.0f); - bool outside_of_interval = (average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f)); - - if(q_outlier || (!p_outlier && outside_of_interval)) { + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { return; } @@ -83,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, int4 buffer_params, int sample) { -#ifdef __KERNEL_CPU__ - const int stride = 1; - (void)storage_stride; -#else +#ifdef __KERNEL_GPU__ const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; #endif + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. */ + return; + } + /* The weighted average of pixel colors (essentially, the NLM-filtered image). * In case the solution of the linear model fails due to numerical issues, * fall back to this value. */ @@ -102,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, final_color = mean_color; } + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; final_color *= sample; if(buffer_params.w) { @@ -114,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, combined_buffer[2] = final_color.z; } -#undef STORAGE_TYPE - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } |