7 files changed, 138 insertions, 148 deletions
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 53d703de143..6226ed2c2ef 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -78,16 +78,10 @@ ccl_device_inline void filter_calculate_scale(float *scale)
 	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
 }
 
-ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer,
-                                                int pass_stride)
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+                                          int pass_stride)
 {
-	return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
-}
-
-ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer,
-                                                  int pass_stride)
-{
-	return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
+	return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); /* Color now comes from features 8, 9 and 10 of `buffer`; NOTE(review): `pass_stride` is not named here, presumably it is consumed inside the ccl_get_feature macro - confirm. */
 }
 
 ccl_device_inline void design_row_add(float *design_row,
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 3185330994c..3ddd8712266 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Uses `buffer` and `pass_stride` from the expansion site. NOTE(review): the replaced _mm_loadu_ps tolerated unaligned addresses; confirm load_float4 does as well. */
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
 
 #define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
                                  for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     __m128 y4 = _mm_set1_ps(pixel.y); \
+                                     float4 y4 = make_float4(pixel.y); \
                                      for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
-                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); /* x coordinates pixel.x .. pixel.x+3 of the 4 pixels handled in this iteration */ \
+                                         int4 active_pixels = x4 < make_float4(high.x); /* per-pixel test against the right window edge; pixels at or beyond high.x fail it */
 
 #define END_FOR_PIXEL_WINDOW_SSE     } \
                                      pixel_buffer += buffer_w - (pixel.x - low.x); \
                                  }
 
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
-                                               __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
                                                const float *ccl_restrict buffer,
-                                               __m128 *features,
-                                               const __m128 *ccl_restrict mean,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
 	features[0] = x;
 	features[1] = y;
-	features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+	features[2] = fabs(ccl_get_feature_sse(0));
 	features[3] = ccl_get_feature_sse(1);
 	features[4] = ccl_get_feature_sse(2);
 	features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
 	features[9] = ccl_get_feature_sse(7);
 	if(mean) {
 		for(int i = 0; i < DENOISE_FEATURES; i++)
-			features[i] = _mm_sub_ps(features[i], mean[i]);
+			features[i] = features[i] - mean[i];
 	}
 	for(int i = 0; i < DENOISE_FEATURES; i++)
-		features[i] = _mm_mask_ps(features[i], active_pixels);
+		features[i] = mask(active_pixels, features[i]);
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
-                                                     __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
                                                      const float *ccl_restrict buffer,
-                                                     __m128 *scales,
-                                                     const __m128 *ccl_restrict mean,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
-	scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
-	scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
-	scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
-	__m128 diff, scale;
-	diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
-	scale = _mm_mul_ps(diff, diff);
-	diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	scales[3] = _mm_mask_ps(scale, active_pixels);
-
-	scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
-	diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
-	scale = _mm_mul_ps(diff, diff);
-	diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	scales[5] = _mm_mask_ps(scale, active_pixels);
+	scales[0] = fabs(x - mean[0]); /* `mean` is dereferenced unconditionally: unlike filter_get_features_sse, NULL is not accepted here */
+	scales[1] = fabs(y - mean[1]);
+	scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); /* feature 0 is compared by absolute value, matching features[2] in filter_get_features_sse */
+	scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + /* features 1..3 yield one combined value: the sum of their squared deviations */
+	            sqr(ccl_get_feature_sse(2) - mean[4]) +
+	            sqr(ccl_get_feature_sse(3) - mean[5]);
+	scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+	scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + /* features 5..7 yield one combined value as well */
+	            sqr(ccl_get_feature_sse(6) - mean[8]) +
+	            sqr(ccl_get_feature_sse(7) - mean[9]);
+	for(int i = 0; i < 6; i++) /* only scales[0..5] are written by this function */
+		scales[i] = mask(active_pixels, scales[i]); /* NOTE(review): presumably clears the lanes that are not set in active_pixels - confirm against mask() */
 }
 
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale) /* In place: reads the accumulated values in scale[0..5] and writes scale[0..9]. */
 {
-	scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
-	scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
-	scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
-	scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
-	scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
-	scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+	scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); /* value is clamped to at least 0.01f before rcp(); NOTE(review): reduce_max replaces _mm_hmax_ps, presumably a maximum over the 4 lanes - confirm */
+	scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+	scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+	scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); /* must read scale[4] before the last statement overwrites it */
+	scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); /* must read scale[5] before the last statement overwrites it */
+	scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); /* keep last: this overwrites scale[4] and scale[5] */
 }
 
 
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 5cb4038bc33..5e989331bc2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int w,
                                               int f)
 {
-#ifdef __KERNEL_SSE3__
-	int aligned_lowx = (rect.x & ~(3));
-	int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+	int aligned_lowx = rect.x / 4;
+	int aligned_highx = (rect.z + 3) / 4;
 	for(int y = rect.y; y < rect.w; y++) {
 		const int low = max(rect.y, y-f);
 		const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
 			out_image[y*w+x] = 0.0f;
 		}
 		for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
-			for(int x = aligned_lowx; x < aligned_highx; x+=4) {
-				_mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+			float4* out_image4 = (float4*)(out_image + y*w); /* row y of the output, viewed as groups of 4 floats */
+			const float4* difference_image4 = (const float4*)(difference_image + y1*w); /* read-only input: keep the const qualifier instead of casting it away */
+			for(int x = aligned_lowx; x < aligned_highx; x++) { /* x indexes float4 groups here (rect.x/4 .. (rect.z+3)/4), not pixels */
+				out_image4[x] += difference_image4[x];
 			}
-#else
-			for(int x = rect.x; x < rect.z; x++) {
-				out_image[y*w+x] += difference_image[y1*w+x];
-			}
-#endif
 		}
 		for(int x = rect.x; x < rect.z; x++) {
 			out_image[y*w+x] *= 1.0f/(high - low);
@@ -101,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 		for(int x = rect.x; x < rect.z; x++) {
 			const int low = max(rect.x, x-f);
 			const int high = min(rect.z, x+f+1);
-			out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+			out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
 		}
 	}
 }
@@ -133,8 +127,6 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            const float *ccl_restrict difference_image,
                                                            const float *ccl_restrict buffer,
-                                                           float *color_pass,
-                                                           float *variance_pass,
                                                            float *transform,
                                                            int *rank,
                                                            float *XtWX,
@@ -167,7 +159,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 			                                dx, dy, w, h,
 			                                pass_stride,
 			                                buffer,
-			                                color_pass, variance_pass,
 			                                l_transform, l_rank,
 			                                weight, l_XtWX, l_XtWY, 0);
 		}
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 078c5f56763..2c5ac807051 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -66,7 +66,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
 		sum += difference_image[y*w+x1];
 	}
 	sum *= 1.0f/(high-low);
-	out_image[y*w+x] = expf(-max(sum, 0.0f));
+	out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
 }
 
 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
@@ -97,8 +97,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                            int dx, int dy,
                                                            const ccl_global float *ccl_restrict difference_image,
                                                            const ccl_global float *ccl_restrict buffer,
-                                                           ccl_global float *color_pass,
-                                                           ccl_global float *variance_pass,
                                                            const ccl_global float *ccl_restrict transform,
                                                            ccl_global int *rank,
                                                            ccl_global float *XtWX,
@@ -130,7 +128,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
 	                                dx, dy, w, h,
 	                                pass_stride,
 	                                buffer,
-	                                color_pass, variance_pass,
 	                                transform, rank,
 	                                weight, XtWX, XtWY,
 	                                localIdx);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 82cc36625ec..2aeb54a62be 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample,
 		varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
 		varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
 	}
-	varA /= (odd_sample - 1);
-	varB /= (even_sample - 1);
+	varA /= max(odd_sample - 1, 1);
+	varB /= max(even_sample - 1, 1);
 
 	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
 	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
@@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample,
 	int idx = (y-rect.y)*buffer_w + (x - rect.x);
 
 	mean[idx] = center_buffer[m_offset] / sample;
-	if(use_split_variance) {
-		variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+	if(sample > 1) {
+		if(use_split_variance) {
+			variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+		}
+		else {
+			variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+		}
 	}
 	else {
-		variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+		/* Variance can't be computed from a single sample, so just set it very high. */
+		variance[idx] = 1e10f;
 	}
 }
 
@@ -114,41 +120,57 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
-	int n = 0;
-	float values[25];
-	for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
-		for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
-			int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
-			float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
-
-			/* Find the position of L. */
-			int i;
-			for(i = 0; i < n; i++) {
-				if(values[i] > L) break;
-			}
-			/* Make space for L by shifting all following values to the right. */
-			for(int j = n; j > i; j--) {
-				values[j] = values[j-1];
-			}
-			/* Insert L. */
-			values[i] = L;
-			n++;
-		}
-	}
-
 	int idx = (y-rect.y)*buffer_w + (x-rect.x);
-	float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+	float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
 
-	float ref = 2.0f*values[(int)(n*0.75f)];
 	float fac = 1.0f;
-	if(L > ref) {
-		/* If the pixel is an outlier, negate the depth value to mark it as one.
-		 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+	if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) {
 		depth[idx] = -depth[idx];
-		fac = ref/L;
-		variance[idx              ] *= fac*fac;
-		variance[idx + pass_stride] *= fac*fac;
-		variance[idx+2*pass_stride] *= fac*fac;
+		fac = 0.0f;
+	}
+	else {
+		float L = average(color);
+		int n = 0;
+		float values[25];
+		for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { /* up to 5x5 neighborhood, clipped to rect */
+			for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+				int neighbor_idx = (y1-rect.y)*buffer_w + (x1-rect.x); /* distinct name: must not shadow the center pixel's idx, which is used again after the loop */
+				float neighbor_L = average(make_float3(image[neighbor_idx], image[neighbor_idx+pass_stride], image[neighbor_idx+2*pass_stride]));
+
+				/* Find the position of neighbor_L in the sorted list. */
+				int i;
+				for(i = 0; i < n; i++) {
+					if(values[i] > neighbor_L) break;
+				}
+				/* Make space for neighbor_L by shifting all following values to the right. */
+				for(int j = n; j > i; j--) {
+					values[j] = values[j-1];
+				}
+				/* Insert neighbor_L. */
+				values[i] = neighbor_L;
+				n++;
+			}
+		}
+
+		float ref = 2.0f*values[(int)(n*0.75f)];
+		if(L > ref) {
+			/* The pixel appears to be an outlier.
+			 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+			 * should actually be at the reference value:
+			 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+			 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+			 */
+			float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+			if(L - 3*stddev < ref) {
+				/* The pixel is an outlier, so negate the depth value to mark it as one.
+				 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+				depth[idx] = -depth[idx];
+				fac = ref/L;
+				variance[idx              ] *= fac*fac;
+				variance[idx + pass_stride] *= fac*fac;
+				variance[idx+2*pass_stride] *= fac*fac;
+			}
+		}
 	}
 	out[idx              ] = fac*image[idx];
 	out[idx + pass_stride] = fac*image[idx + pass_stride];
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 4a4c81b7ba3..25a3025056c 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -22,8 +22,6 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int w, int h,
                                                        int pass_stride,
                                                        const ccl_global float *ccl_restrict buffer,
-                                                       ccl_global float *color_pass,
-                                                       ccl_global float *variance_pass,
                                                        const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
                                                        float weight,
@@ -31,38 +29,31 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        ccl_global float3 *XtWY,
                                                        int localIdx)
 {
+	if(weight < 1e-3f) {
+		return;
+	}
+
 	int p_offset =  y    *w +  x;
 	int q_offset = (y+dy)*w + (x+dx);
 
-#ifdef __KERNEL_CPU__
-	const int stride = 1;
-	(void)storage_stride;
-	(void)localIdx;
-	float design_row[DENOISE_FEATURES+1];
-#elif defined(__KERNEL_CUDA__)
+#ifdef __KERNEL_GPU__
 	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
 	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
 	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
 #else
-	const int stride = storage_stride;
 	float design_row[DENOISE_FEATURES+1];
 #endif
 
-	float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
-	float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+	float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
 
-	float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
-	float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
-
-	/* If the pixel was flagged as an outlier during prefiltering, skip it.
-	 * Otherwise, perform the regular confidence interval test unless
-	 * the center pixel is an outlier (in that case, using the confidence
-	 * interval test could result in no pixels being used at all). */
-	bool p_outlier = (ccl_get_feature(buffer + p_offset, 0) < 0.0f);
-	bool q_outlier = (ccl_get_feature(buffer + q_offset, 0) < 0.0f);
-	bool outside_of_interval = (average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f));
-
-	if(q_outlier || (!p_outlier && outside_of_interval)) {
+	/* If the pixel was flagged as an outlier during prefiltering, skip it. */
+	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
 		return;
 	}
 
@@ -83,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                                               int4 buffer_params,
                                               int sample)
 {
-#ifdef __KERNEL_CPU__
-	const int stride = 1;
-	(void)storage_stride;
-#else
+#ifdef __KERNEL_GPU__
 	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
 #endif
 
+	if(XtWX[0] < 1e-3f) {
+		/* There is not enough information to determine a denoised result.
+		 * As a fallback, keep the original value of the pixel. */
+		return;
+	}
+
 	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
 	 * In case the solution of the linear model fails due to numerical issues,
 	 * fall back to this value. */
@@ -102,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
 		final_color = mean_color;
 	}
 
+	/* Clamp negative pixel values to zero. */
+	final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
 	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
 	final_color *= sample;
 	if(buffer_params.w) {
@@ -114,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
 	combined_buffer[2] = final_color.z;
 }
 
-#undef STORAGE_TYPE
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 30dc2969b11..9e65f61664b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
-	__m128 features[DENOISE_FEATURES];
+	float4 features[DENOISE_FEATURES];
 	const float *ccl_restrict pixel_buffer;
 	int2 pixel;
 
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 	                      min(rect.w, y + radius + 1));
 	int num_pixels = (high.y - low.y) * (high.x - low.x);
 
-	__m128 feature_means[DENOISE_FEATURES];
+	float4 feature_means[DENOISE_FEATURES];
 	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
 		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
 	} END_FOR_PIXEL_WINDOW_SSE
 
-	__m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+	float4 pixel_scale = make_float4(1.0f / num_pixels);
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
-		feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+		feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
 	}
 
-	__m128 feature_scale[DENOISE_FEATURES];
+	float4 feature_scale[DENOISE_FEATURES];
 	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
 	filter_calculate_scale_sse(feature_scale);
 
-	__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+	float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
 	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
 		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
 	} END_FOR_PIXEL_WINDOW_SSE
 
 	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
 	/* Bake the feature scaling into the transformation matrix. */
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
-		math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+		math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
 	}
 }