Merge branch 'master' into blender2.8

author: Campbell Barton <ideasman42@gmail.com> 2017-05-20 07:19:05 +0300
committer: Campbell Barton <ideasman42@gmail.com> 2017-05-20 07:19:05 +0300
commit: 65aab6cdae822822122f5e180baf10e39f600bf2 (patch)
tree: fc8556ec5590a0e47c41e2c18131fede4b67732b /intern
parent: 996bf65730257d0a80c6ada03a38d0a321b1e87e (diff)
parent: 81e584ed17902878579131776b4e5a9f7b54cdab (diff)
22 files changed, 352 insertions, 247 deletions
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 41998c792b6..53d703de143 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -28,7 +28,11 @@
                                  pixel_buffer += buffer_w - (high.x - low.x); \
                              }
 
-ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_features(int2 pixel,
+                                           const ccl_global float *ccl_restrict buffer,
+                                           float *features,
+                                           const float *ccl_restrict mean,
+                                           int pass_stride)
 {
 	features[0] = pixel.x;
 	features[1] = pixel.y;
@@ -46,7 +50,11 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest
 	}
 }
 
-ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+                                                 const ccl_global float *ccl_restrict buffer,
+                                                 float *scales,
+                                                 const float *ccl_restrict mean,
+                                                 int pass_stride)
 {
 	scales[0] = fabsf(pixel.x - mean[0]);
 	scales[1] = fabsf(pixel.y - mean[1]);
@@ -70,19 +78,21 @@ ccl_device_inline void filter_calculate_scale(float *scale)
 	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
 }
 
-ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer,
+                                                int pass_stride)
 {
 	return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
 }
 
-ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer,
+                                                  int pass_stride)
 {
 	return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
 }
 
 ccl_device_inline void design_row_add(float *design_row,
                                       int rank,
-                                      ccl_global float ccl_restrict_ptr transform,
+                                      const ccl_global float *ccl_restrict transform,
                                       int stride,
                                       int row,
                                       float feature)
@@ -94,13 +104,13 @@ ccl_device_inline void design_row_add(float *design_row,
 
 /* Fill the design row. */
 ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
-                                                       ccl_global float ccl_restrict_ptr p_buffer,
+                                                       const ccl_global float *ccl_restrict p_buffer,
                                                        int2 q_pixel,
-                                                       ccl_global float ccl_restrict_ptr q_buffer,
+                                                       const ccl_global float *ccl_restrict q_buffer,
                                                        int pass_stride,
                                                        int rank,
                                                        float *design_row,
-                                                       ccl_global float ccl_restrict_ptr transform,
+                                                       const ccl_global float *ccl_restrict transform,
                                                        int stride)
 {
 	design_row[0] = 1.0f;
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index a242a8ed0a1..3185330994c 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -33,7 +33,12 @@ CCL_NAMESPACE_BEGIN
                                      pixel_buffer += buffer_w - (pixel.x - low.x); \
                                  }
 
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
+                                               __m128 active_pixels,
+                                               const float *ccl_restrict buffer,
+                                               __m128 *features,
+                                               const __m128 *ccl_restrict mean,
+                                               int pass_stride)
 {
 	features[0] = x;
 	features[1] = y;
@@ -53,7 +58,12 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active
 		features[i] = _mm_mask_ps(features[i], active_pixels);
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
+                                                     __m128 active_pixels,
+                                                     const float *ccl_restrict buffer,
+                                                     __m128 *scales,
+                                                     const __m128 *ccl_restrict mean,
+                                                     int pass_stride)
 {
 	scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
 	scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 1a314b100be..5cb4038bc33 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,27 +16,39 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+                                                         const float *ccl_restrict weight_image,
+                                                         const float *ccl_restrict variance_image,
+                                                         float *difference_image,
+                                                         int4 rect,
+                                                         int w,
+                                                         int channel_offset,
+                                                         float a,
+                                                         float k_2)
 {
 	for(int y = rect.y; y < rect.w; y++) {
 		for(int x = rect.x; x < rect.z; x++) {
 			float diff = 0.0f;
 			int numChannels = channel_offset? 3 : 1;
 			for(int c = 0; c < numChannels; c++) {
-				float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
-				float pvar = varianceImage[c*channel_offset + y*w+x];
-				float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+				float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+				float pvar = variance_image[c*channel_offset + y*w+x];
+				float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
 				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
 			}
 			if(numChannels > 1) {
 				diff *= 1.0f/numChannels;
 			}
-			differenceImage[y*w+x] = diff;
+			difference_image[y*w+x] = diff;
 		}
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+                                              float *out_image,
+                                              int4 rect,
+                                              int w,
+                                              int f)
 {
 #ifdef __KERNEL_SSE3__
 	int aligned_lowx = (rect.x & ~(3));
@@ -46,30 +58,34 @@ ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceI
 		const int low = max(rect.y, y-f);
 		const int high = min(rect.w, y+f+1);
 		for(int x = rect.x; x < rect.z; x++) {
-			outImage[y*w+x] = 0.0f;
+			out_image[y*w+x] = 0.0f;
 		}
 		for(int y1 = low; y1 < high; y1++) {
 #ifdef __KERNEL_SSE3__
 			for(int x = aligned_lowx; x < aligned_highx; x+=4) {
-				_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
+				_mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
 			}
 #else
 			for(int x = rect.x; x < rect.z; x++) {
-				outImage[y*w+x] += differenceImage[y1*w+x];
+				out_image[y*w+x] += difference_image[y1*w+x];
 			}
 #endif
 		}
 		for(int x = rect.x; x < rect.z; x++) {
-			outImage[y*w+x] *= 1.0f/(high - low);
+			out_image[y*w+x] *= 1.0f/(high - low);
 		}
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image,
+                                                     int4 rect,
+                                                     int w,
+                                                     int f)
 {
 	for(int y = rect.y; y < rect.w; y++) {
 		for(int x = rect.x; x < rect.z; x++) {
-			outImage[y*w+x] = 0.0f;
+			out_image[y*w+x] = 0.0f;
 		}
 	}
 	for(int dx = -f; dx <= f; dx++) {
@@ -77,7 +93,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff
 		int neg_dx = min(0, dx);
 		for(int y = rect.y; y < rect.w; y++) {
 			for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-				outImage[y*w+x] += differenceImage[y*w+dx+x];
+				out_image[y*w+x] += difference_image[y*w+dx+x];
 			}
 		}
 	}
@@ -85,12 +101,19 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff
 		for(int x = rect.x; x < rect.z; x++) {
 			const int low = max(rect.x, x-f);
 			const int high = min(rect.z, x+f+1);
-			outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
+			out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
 		}
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+                                                       const float *ccl_restrict difference_image,
+                                                       const float *ccl_restrict image,
+                                                       float *out_image,
+                                                       float *accum_image,
+                                                       int4 rect,
+                                                       int w,
+                                                       int f)
 {
 	for(int y = rect.y; y < rect.w; y++) {
 		for(int x = rect.x; x < rect.z; x++) {
@@ -98,18 +121,18 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl
 			const int high = min(rect.z, x+f+1);
 			float sum = 0.0f;
 			for(int x1 = low; x1 < high; x1++) {
-				sum += differenceImage[y*w+x1];
+				sum += difference_image[y*w+x1];
 			}
 			float weight = sum * (1.0f/(high - low));
-			accumImage[y*w+x] += weight;
-			outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+			accum_image[y*w+x] += weight;
+			out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
 		}
 	}
 }
 
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
-                                                           float ccl_restrict_ptr differenceImage,
-                                                           float ccl_restrict_ptr buffer,
+                                                           const float *ccl_restrict difference_image,
+                                                           const float *ccl_restrict buffer,
                                                            float *color_pass,
                                                            float *variance_pass,
                                                            float *transform,
@@ -130,7 +153,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 			const int high = min(rect.z, x+f+1);
 			float sum = 0.0f;
 			for(int x1 = low; x1 < high; x1++) {
-				sum += differenceImage[y*w+x1];
+				sum += difference_image[y*w+x1];
 			}
 			float weight = sum * (1.0f/(high - low));
 
@@ -151,11 +174,14 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+                                                   const float *ccl_restrict accum_image,
+                                                   int4 rect,
+                                                   int w)
 {
 	for(int y = rect.y; y < rect.w; y++) {
 		for(int x = rect.x; x < rect.z; x++) {
-			outImage[y*w+x] /= accumImage[y*w+x];
+			out_image[y*w+x] /= accum_image[y*w+x];
 		}
 	}
 }
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index b5ba7cf51a5..078c5f56763 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                          int dx, int dy,
-                                                         ccl_global float ccl_restrict_ptr weightImage,
-                                                         ccl_global float ccl_restrict_ptr varianceImage,
-                                                         ccl_global float *differenceImage,
+                                                         const ccl_global float *ccl_restrict weight_image,
+                                                         const ccl_global float *ccl_restrict variance_image,
+                                                         ccl_global float *difference_image,
                                                          int4 rect, int w,
                                                          int channel_offset,
                                                          float a, float k_2)
@@ -28,78 +28,78 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
 	float diff = 0.0f;
 	int numChannels = channel_offset? 3 : 1;
 	for(int c = 0; c < numChannels; c++) {
-		float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
-		float pvar = varianceImage[c*channel_offset + y*w+x];
-		float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+		float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+		float pvar = variance_image[c*channel_offset + y*w+x];
+		float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
 		diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
 	}
 	if(numChannels > 1) {
 		diff *= 1.0f/numChannels;
 	}
-	differenceImage[y*w+x] = diff;
+	difference_image[y*w+x] = diff;
 }
 
 ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
-                                              ccl_global float ccl_restrict_ptr differenceImage,
-                                              ccl_global float *outImage,
+                                              const ccl_global float *ccl_restrict difference_image,
+                                              ccl_global float *out_image,
                                               int4 rect, int w, int f)
 {
 	float sum = 0.0f;
 	const int low = max(rect.y, y-f);
 	const int high = min(rect.w, y+f+1);
 	for(int y1 = low; y1 < high; y1++) {
-		sum += differenceImage[y1*w+x];
+		sum += difference_image[y1*w+x];
 	}
 	sum *= 1.0f/(high-low);
-	outImage[y*w+x] = sum;
+	out_image[y*w+x] = sum;
 }
 
 ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
-                                                     ccl_global float ccl_restrict_ptr differenceImage,
-                                                     ccl_global float *outImage,
+                                                     const ccl_global float *ccl_restrict difference_image,
+                                                     ccl_global float *out_image,
                                                      int4 rect, int w, int f)
 {
 	float sum = 0.0f;
 	const int low = max(rect.x, x-f);
 	const int high = min(rect.z, x+f+1);
 	for(int x1 = low; x1 < high; x1++) {
-		sum += differenceImage[y*w+x1];
+		sum += difference_image[y*w+x1];
 	}
 	sum *= 1.0f/(high-low);
-	outImage[y*w+x] = expf(-max(sum, 0.0f));
+	out_image[y*w+x] = expf(-max(sum, 0.0f));
 }
 
 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                                                        int dx, int dy,
-                                                       ccl_global float ccl_restrict_ptr differenceImage,
-                                                       ccl_global float ccl_restrict_ptr image,
-                                                       ccl_global float *outImage,
-                                                       ccl_global float *accumImage,
+                                                       const ccl_global float *ccl_restrict difference_image,
+                                                       const ccl_global float *ccl_restrict image,
+                                                       ccl_global float *out_image,
+                                                       ccl_global float *accum_image,
                                                        int4 rect, int w, int f)
 {
 	float sum = 0.0f;
 	const int low = max(rect.x, x-f);
 	const int high = min(rect.z, x+f+1);
 	for(int x1 = low; x1 < high; x1++) {
-		sum += differenceImage[y*w+x1];
+		sum += difference_image[y*w+x1];
 	}
 	sum *= 1.0f/(high-low);
-	if(outImage) {
-		accumImage[y*w+x] += sum;
-		outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+	if(out_image) {
+		accum_image[y*w+x] += sum;
+		out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
 	}
 	else {
-		accumImage[y*w+x] = sum;
+		accum_image[y*w+x] = sum;
 	}
 }
 
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                            int dx, int dy,
-                                                           ccl_global float ccl_restrict_ptr differenceImage,
-                                                           ccl_global float ccl_restrict_ptr buffer,
+                                                           const ccl_global float *ccl_restrict difference_image,
+                                                           const ccl_global float *ccl_restrict buffer,
                                                            ccl_global float *color_pass,
                                                            ccl_global float *variance_pass,
-                                                           ccl_global float ccl_restrict_ptr transform,
+                                                           const ccl_global float *ccl_restrict transform,
                                                            ccl_global int *rank,
                                                            ccl_global float *XtWX,
                                                            ccl_global float3 *XtWY,
@@ -115,7 +115,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
 	const int high = min(rect.z, x+f+1);
 	float sum = 0.0f;
 	for(int x1 = low; x1 < high; x1++) {
-		sum += differenceImage[y*w+x1];
+		sum += difference_image[y*w+x1];
 	}
 	float weight = sum * (1.0f/(high - low));
 
@@ -137,11 +137,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
 }
 
 ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
-                                                   ccl_global float *outImage,
-                                                   ccl_global float ccl_restrict_ptr accumImage,
+                                                   ccl_global float *out_image,
+                                                   const ccl_global float *ccl_restrict accum_image,
                                                    int4 rect, int w)
 {
-	outImage[y*w+x] /= accumImage[y*w+x];
+	out_image[y*w+x] /= accum_image[y*w+x];
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 252bcc5e675..82cc36625ec 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -44,7 +44,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
 
 	int offset = tiles->offsets[tile];
 	int stride = tiles->strides[tile];
-	ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
+	const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
 	center_buffer += (y*stride + x + offset)*buffer_pass_stride;
 	center_buffer += buffer_denoising_offset + 14;
 
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 6a7c86e4012..dc90f318570 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -21,10 +21,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int dx, int dy,
                                                        int w, int h,
                                                        int pass_stride,
-                                                       ccl_global float ccl_restrict_ptr buffer,
+                                                       const ccl_global float *ccl_restrict buffer,
                                                        ccl_global float *color_pass,
                                                        ccl_global float *variance_pass,
-                                                       ccl_global float ccl_restrict_ptr transform,
+                                                       const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
                                                        float weight,
                                                        ccl_global float *XtWX,
@@ -85,9 +85,17 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
 	const int stride = storage_stride;
 #endif
 
+	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
+	 * In case the solution of the linear model fails due to numerical issues,
+	 * fall back to this value. */
+	float3 mean_color = XtWY[0]/XtWX[0];
+
 	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
 
 	float3 final_color = XtWY[0];
+	if(!isfinite3_safe(final_color)) {
+		final_color = mean_color;
+	}
 
 	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
 	final_color *= sample;
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 139dc402d21..a5f87c05ec0 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
                                                   int x, int y, int4 rect,
                                                   int pass_stride,
                                                   float *transform, int *rank,
@@ -29,20 +29,15 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 	/* Temporary storage, used in different steps of the algorithm. */
 	float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
 	float tempvector[2*DENOISE_FEATURES];
-	float ccl_restrict_ptr pixel_buffer;
+	const float *ccl_restrict pixel_buffer;
 	int2 pixel;
 
-
-
-
 	/* === Calculate denoising window. === */
 	int2 low  = make_int2(max(rect.x, x - radius),
 	                      max(rect.y, y - radius));
 	int2 high = make_int2(min(rect.z, x + radius + 1),
 	                      min(rect.w, y + radius + 1));
-
-
-
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
 
 	/* === Shift feature passes to have mean 0. === */
 	float feature_means[DENOISE_FEATURES];
@@ -52,8 +47,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		math_vector_add(feature_means, features, DENOISE_FEATURES);
 	} END_FOR_PIXEL_WINDOW
 
-	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
-	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
 
 	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
 	float *feature_scale = tempvector;
@@ -66,7 +60,6 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 
 	filter_calculate_scale(feature_scale);
 
-
 	/* === Generate the feature transformation. ===
 	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
 	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
@@ -80,6 +73,8 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 
 	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
 	*rank = 0;
+	/* Prevent overfitting when a small window is used. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
 	if(pca_threshold < 0.0f) {
 		float threshold_energy = 0.0f;
 		for(int i = 0; i < DENOISE_FEATURES; i++) {
@@ -88,7 +83,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		threshold_energy *= 1.0f - (-pca_threshold);
 
 		float reduced_energy = 0.0f;
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			if(i >= 2 && reduced_energy >= threshold_energy)
 				break;
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
@@ -96,7 +91,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		}
 	}
 	else {
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
 			if(i >= 2 && sqrtf(s) < pca_threshold)
 				break;
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 68304e14143..83a1222bbdb 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
                                                   int x, int y, int4 rect,
                                                   int pass_stride,
                                                   ccl_global float *transform,
@@ -38,7 +38,8 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 	                      max(rect.y, y - radius));
 	int2 high = make_int2(min(rect.z, x + radius + 1),
 	                      min(rect.w, y + radius + 1));
-	ccl_global float ccl_restrict_ptr pixel_buffer;
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+	const ccl_global float *ccl_restrict pixel_buffer;
 	int2 pixel;
 
 
@@ -52,8 +53,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 		math_vector_add(feature_means, features, DENOISE_FEATURES);
 	} END_FOR_PIXEL_WINDOW
 
-	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
-	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
 
 	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
 	float feature_scale[DENOISE_FEATURES];
@@ -81,6 +81,8 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 
 	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
 	*rank = 0;
+	/* Prevent overfitting when a small window is used. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
 	if(pca_threshold < 0.0f) {
 		float threshold_energy = 0.0f;
 		for(int i = 0; i < DENOISE_FEATURES; i++) {
@@ -89,7 +91,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 		threshold_energy *= 1.0f - (-pca_threshold);
 
 		float reduced_energy = 0.0f;
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			if(i >= 2 && reduced_energy >= threshold_energy)
 				break;
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
@@ -97,7 +99,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
 		}
 	}
 	else {
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
 			if(i >= 2 && sqrtf(s) < pca_threshold)
 				break;
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index ed3a92f6241..30dc2969b11 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
                                                   int x, int y, int4 rect,
                                                   int pass_stride,
                                                   float *transform, int *rank,
@@ -25,13 +25,14 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
 	__m128 features[DENOISE_FEATURES];
-	float ccl_restrict_ptr pixel_buffer;
+	const float *ccl_restrict pixel_buffer;
 	int2 pixel;
 
 	int2 low  = make_int2(max(rect.x, x - radius),
 	                      max(rect.y, y - radius));
 	int2 high = make_int2(min(rect.z, x + radius + 1),
 	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
 
 	__m128 feature_means[DENOISE_FEATURES];
 	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
@@ -40,7 +41,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
 	} END_FOR_PIXEL_WINDOW_SSE
 
-	__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
+	__m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
 		feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
 	}
@@ -68,6 +69,8 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
 
 	*rank = 0;
+	/* Prevent overfitting when a small window is used. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
 	if(pca_threshold < 0.0f) {
 		float threshold_energy = 0.0f;
 		for(int i = 0; i < DENOISE_FEATURES; i++) {
@@ -76,7 +79,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		threshold_energy *= 1.0f - (-pca_threshold);
 
 		float reduced_energy = 0.0f;
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			if(i >= 2 && reduced_energy >= threshold_energy)
 				break;
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
@@ -84,7 +87,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
 		}
 	}
 	else {
-		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
 			float s = feature_matrix[i*DENOISE_FEATURES+i];
 			if(i >= 2 && sqrtf(s) < pca_threshold)
 				break;
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 7595e74e2d5..21da180bb8e 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -42,8 +42,6 @@
 #include "util/util_types.h"
 #include "util/util_texture.h"
 
-#define ccl_restrict_ptr const * __restrict
-
 #define ccl_addr_space
 
 #define ccl_local_id(d) 0
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 80d7401fbcf..988126f90e1 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,7 +55,6 @@
 #define ccl_restrict __restrict__
 #define ccl_align(n) __align__(n)
 
-#define ccl_restrict_ptr const * __restrict__
 #define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
 
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 15cf4b81b21..c2263ac0d49 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,8 +50,6 @@
 #  define ccl_addr_space
 #endif
 
-#define ccl_restrict_ptr const * __restrict__
-
 #define ccl_local_id(d) get_local_id(d)
 #define ccl_global_id(d) get_global_id(d)
 
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 9708b4b5b58..ffd34c293fc 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -72,40 +72,40 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int dy,
-                                                           float *weightImage,
+                                                           float *weight_image,
                                                            float *variance,
-                                                           float *differenceImage,
+                                                           float *difference_image,
                                                            int* rect,
                                                            int w,
                                                            int channel_offset,
                                                            float a,
                                                            float k_2);
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
-                                                float *outImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
                                                 int* rect,
                                                 int w,
                                                 int f);
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
-                                                       float *outImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
                                                        int* rect,
                                                        int w,
                                                        int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
-                                                         float *differenceImage,
+                                                         float *difference_image,
                                                          float *image,
-                                                         float *outImage,
-                                                         float *accumImage,
+                                                         float *out_image,
+                                                         float *accum_image,
                                                          int* rect,
                                                          int w,
                                                          int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
-                                                             float *differenceImage,
+                                                             float *difference_image,
                                                              float *buffer,
                                                              float *color_pass,
                                                              float *variance_pass,
@@ -120,8 +120,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int f,
                                                              int pass_stride);
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
-                                                     float *accumImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
                                                      int* rect,
                                                      int w);
 
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 15325abdccd..261176846b1 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -150,9 +150,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int dy,
-                                                           float *weightImage,
+                                                           float *weight_image,
                                                            float *variance,
-                                                           float *differenceImage,
+                                                           float *difference_image,
                                                            int *rect,
                                                            int w,
                                                            int channel_offset,
@@ -162,12 +162,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
 #else
-	kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+	kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
-                                                float *outImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
                                                 int *rect,
                                                 int w,
                                                 int f)
@@ -175,12 +175,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
 #else
-	kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+	kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
-                                                       float *outImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
                                                        int *rect,
                                                        int w,
                                                        int f)
@@ -188,16 +188,16 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
 #else
-	kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+	kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
-                                                         float *differenceImage,
+                                                         float *difference_image,
                                                          float *image,
-                                                         float *outImage,
-                                                         float *accumImage,
+                                                         float *out_image,
+                                                         float *accum_image,
                                                          int *rect,
                                                          int w,
                                                          int f)
@@ -205,13 +205,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-	kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
-                                                             float *differenceImage,
+                                                             float *difference_image,
                                                              float *buffer,
                                                              float *color_pass,
                                                              float *variance_pass,
@@ -229,19 +229,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-    kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+    kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
-                                                     float *accumImage,
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
                                                      int *rect,
                                                      int w)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
 #else
-	kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+	kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index f812a6601c6..2edbff08087 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -139,69 +139,74 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
-                                       float ccl_restrict_ptr weightImage,
-                                       float ccl_restrict_ptr varianceImage,
-                                       float *differenceImage,
+                                       const float *ccl_restrict weight_image,
+                                       const float *ccl_restrict variance_image,
+                                       float *difference_image,
                                        int4 rect, int w,
                                        int channel_offset,
-                                       float a, float k_2) {
+                                       float a, float k_2)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_nlm_update_output(int dx, int dy,
-                                     float ccl_restrict_ptr differenceImage,
-                                     float ccl_restrict_ptr image,
-                                     float *outImage, float *accumImage,
+                                     const float *ccl_restrict difference_image,
+                                     const float *ccl_restrict image,
+                                     float *out_image, float *accum_image,
                                      int4 rect, int w,
-                                     int f) {
+                                     int f)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
+kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
-                                         float ccl_restrict_ptr differenceImage,
-                                         float ccl_restrict_ptr buffer,
+                                         const float *ccl_restrict difference_image,
+                                         const float *ccl_restrict buffer,
                                          float *color_pass,
                                          float *variance_pass,
                                          float const* __restrict__ transform,
@@ -211,13 +216,14 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
                                          int4 rect,
                                          int4 filter_rect,
                                          int w, int h, int f,
-                                         int pass_stride) {
+                                         int pass_stride)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
 	int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
 	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
 		kernel_filter_nlm_construct_gramian(x, y,
 		                                    dx, dy,
-		                                    differenceImage,
+		                                    difference_image,
 		                                    buffer,
 		                                    color_pass, variance_pass,
 		                                    transform, rank,
@@ -235,7 +241,8 @@ kernel_cuda_filter_finalize(int w, int h,
                             float *buffer, int *rank,
                             float *XtWX, float3 *XtWY,
                             int4 filter_area, int4 buffer_params,
-                            int sample) {
+                            int sample)
+{
 	int x = blockDim.x*blockIdx.x + threadIdx.x;
 	int y = blockDim.y*blockIdx.y + threadIdx.y;
 	if(x < filter_area.z && y < filter_area.w) {
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index fbc3daa62b9..0462ca6f9bc 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -106,7 +106,7 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
 	}
 }
 
-__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
                                                     ccl_global float *transform,
                                                     ccl_global int *rank,
                                                     int4 filter_area,
@@ -132,79 +132,84 @@ __kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restric
 
 __kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
                                                     int dy,
-                                                    ccl_global float ccl_restrict_ptr weightImage,
-                                                    ccl_global float ccl_restrict_ptr varianceImage,
-                                                    ccl_global float *differenceImage,
+                                                    const ccl_global float *ccl_restrict weight_image,
+                                                    const ccl_global float *ccl_restrict variance_image,
+                                                    ccl_global float *difference_image,
                                                     int4 rect,
                                                     int w,
                                                     int channel_offset,
                                                     float a,
-                                                    float k_2) {
+                                                    float k_2)
+{
 	int x = get_global_id(0) + rect.x;
 	int y = get_global_id(1) + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
-                                         ccl_global float *outImage,
+__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
+                                         ccl_global float *out_image,
                                          int4 rect,
                                          int w,
-                                         int f) {
+                                         int f)
+{
 	int x = get_global_id(0) + rect.x;
 	int y = get_global_id(1) + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
-                                                ccl_global float *outImage,
+__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
+                                                ccl_global float *out_image,
                                                 int4 rect,
                                                 int w,
-                                                int f) {
+                                                int f)
+{
 	int x = get_global_id(0) + rect.x;
 	int y = get_global_id(1) + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
 	}
 }
 
 __kernel void kernel_ocl_filter_nlm_update_output(int dx,
                                                   int dy,
-                                                  ccl_global float ccl_restrict_ptr differenceImage,
-                                                  ccl_global float ccl_restrict_ptr image,
-                                                  ccl_global float *outImage,
-                                                  ccl_global float *accumImage,
+                                                  const ccl_global float *ccl_restrict difference_image,
+                                                  const ccl_global float *ccl_restrict image,
+                                                  ccl_global float *out_image,
+                                                  ccl_global float *accum_image,
                                                   int4 rect,
                                                   int w,
-                                                  int f) {
+                                                  int f)
+{
 	int x = get_global_id(0) + rect.x;
 	int y = get_global_id(1) + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
-                                              ccl_global float ccl_restrict_ptr accumImage,
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
+                                              const ccl_global float *ccl_restrict accum_image,
                                               int4 rect,
-                                              int w) {
+                                              int w)
+{
 	int x = get_global_id(0) + rect.x;
 	int y = get_global_id(1) + rect.y;
 	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
 	}
 }
 
 __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
                                                       int dy,
-                                                      ccl_global float ccl_restrict_ptr differenceImage,
-                                                      ccl_global float ccl_restrict_ptr buffer,
+                                                      const ccl_global float *ccl_restrict difference_image,
+                                                      const ccl_global float *ccl_restrict buffer,
                                                       ccl_global float *color_pass,
                                                       ccl_global float *variance_pass,
-                                                      ccl_global float ccl_restrict_ptr transform,
+                                                      const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
                                                       ccl_global float *XtWX,
                                                       ccl_global float3 *XtWY,
@@ -213,13 +218,14 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
                                                       int w,
                                                       int h,
                                                       int f,
-                                                      int pass_stride) {
+                                                      int pass_stride)
+{
 	int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
 	int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
 	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
 		kernel_filter_nlm_construct_gramian(x, y,
 		                                    dx, dy,
-		                                    differenceImage,
+		                                    difference_image,
 		                                    buffer,
 		                                    color_pass, variance_pass,
 		                                    transform, rank,
@@ -239,7 +245,8 @@ __kernel void kernel_ocl_filter_finalize(int w,
                                          ccl_global float3 *XtWY,
                                          int4 filter_area,
                                          int4 buffer_params,
-                                         int sample) {
+                                         int sample)
+{
 	int x = get_global_id(0);
 	int y = get_global_id(1);
 	if(x < filter_area.z && y < filter_area.w) {
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 1f6dce0253c..4c1fdd2d69c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,7 +111,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		kernel_write_light_passes(kg, buffer, L, sample);
 #ifdef __KERNEL_DEBUG__
 		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
 #endif
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 3f080407b1f..08909943c49 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -726,16 +726,20 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	return requested_features;
 }
 
-void Session::load_kernels()
+void Session::load_kernels(bool lock_scene)
 {
-	thread_scoped_lock scene_lock(scene->mutex);
+	thread_scoped_lock scene_lock;
+	if(lock_scene) {
+		scene_lock = thread_scoped_lock(scene->mutex);
+	}
 
-	if(!kernels_loaded) {
+	DeviceRequestedFeatures requested_features = get_requested_device_features();
+
+	if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
 		scoped_timer timer;
 
-		DeviceRequestedFeatures requested_features = get_requested_device_features();
 		VLOG(2) << "Requested features:\n" << requested_features;
 		if(!device->load_kernels(requested_features)) {
 			string message = device->error_message();
@@ -752,6 +756,7 @@ void Session::load_kernels()
 		VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
 
 		kernels_loaded = true;
+		loaded_kernel_features = requested_features;
 	}
 }
 
@@ -902,6 +907,8 @@ void Session::update_scene()
 
 	/* update scene */
 	if(scene->need_update()) {
+		load_kernels(false);
+
 		progress.set_status("Updating Scene");
 		MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
 	}
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index a7ca90abbce..5fb1a365ee9 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -158,7 +158,7 @@ public:
 	void set_pause(bool pause);
 
 	void update_scene();
-	void load_kernels();
+	void load_kernels(bool lock_scene=true);
 
 	void device_free();
 
@@ -215,6 +215,7 @@ protected:
 	thread_mutex display_mutex;
 
 	bool kernels_loaded;
+	DeviceRequestedFeatures loaded_kernel_features;
 
 	double reset_time;
 
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index a754be413fe..5327d9f7cc6 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -367,6 +367,11 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 #endif
 }
 
+ccl_device_inline bool isfinite3_safe(float3 v)
+{
+	return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
+}
+
 ccl_device_inline float3 ensure_finite3(float3 v)
 {
 	if(!isfinite_safe(v.x)) v.x = 0.0;
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 2172e94a14f..c7511f8306e 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -23,73 +23,83 @@ CCL_NAMESPACE_BEGIN
 
 /* Variants that use a constant stride on GPUS. */
 #ifdef __KERNEL_GPU__
-#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+#  define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
 /* Element access when only the lower-triangular elements are stored. */
-#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
-#define VECS(V, i, s) V[(i)*(s)]
+#  define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+#  define VECS(V, i, s) V[(i)*(s)]
 #else
-#define MATS(A, n, r, c, s) MAT(A, n, r, c)
-#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
-#define VECS(V, i, s) V[i]
+#  define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#  define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+#  define VECS(V, i, s) V[i]
 #endif
 
 /* Zeroing helpers. */
 
 ccl_device_inline void math_vector_zero(float *v, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		v[i] = 0.0f;
+	}
 }
 
 ccl_device_inline void math_matrix_zero(float *A, int n)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MAT(A, n, row, col) = 0.0f;
+		}
+	}
 }
 
 /* Elementary vector operations. */
 
-ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n)
+ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i] += b[i];
+	}
 }
 
-ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n)
+ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i] *= b[i];
+	}
 }
 
-ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n)
+ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i*astride] *= b[i];
+	}
 }
 
 ccl_device_inline void math_vector_scale(float *a, float b, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i] *= b;
+	}
 }
 
-ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n)
+ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i] = max(a[i], b[i]);
+	}
 }
 
 ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		v[i] += w*x[i];
+	}
 }
 
 ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		v[i*stride] += w*x[i];
+	}
 }
 
 /* Elementary matrix operations.
@@ -97,33 +107,38 @@ ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float
 
 ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
 {
-	for(int row = 0; row < n; row++)
+	for(int row = 0; row < n; row++) {
 		MATHS(A, row, row, stride) += val;
+	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
 ccl_device_inline void math_matrix_add_gramian(float *A,
                                                   int n,
-                                                  float ccl_restrict_ptr v,
+                                                  const float *ccl_restrict v,
                                                   float weight)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MAT(A, n, row, col) += v[row]*v[col]*weight;
+		}
+	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
 ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
                                                           int n,
-                                                          float ccl_restrict_ptr v,
+                                                          const float *ccl_restrict v,
                                                           float weight,
                                                           int stride)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+		}
+	}
 }
 
 /* Transpose matrix A inplace. */
@@ -138,9 +153,6 @@ ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int str
 	}
 }
 
-
-
-
 /* Solvers for matrix problems */
 
 /* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
@@ -199,10 +211,6 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global
 	}
 }
 
-
-
-
-
 /* Perform the Jacobi Eigenvalue Methon on matrix A.
  * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
  * The algorithm overwrites the contents of A.
@@ -215,15 +223,19 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 {
 	const float singular_epsilon = 1e-9f;
 
-	for (int row = 0; row < n; row++)
-		for (int col = 0; col < n; col++)
+	for (int row = 0; row < n; row++) {
+		for (int col = 0; col < n; col++) {
 			MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+		}
+	}
 
 	for (int sweep = 0; sweep < 8; sweep++) {
 		float off_diagonal = 0.0f;
-		for (int row = 1; row < n; row++)
-			for (int col = 0; col < row; col++)
+		for (int row = 1; row < n; row++) {
+			for (int col = 0; col < row; col++) {
 				off_diagonal += fabsf(MAT(A, n, row, col));
+			}
+		}
 		if (off_diagonal < 1e-7f) {
 			/* The matrix has nearly reached diagonal form.
 			 * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
@@ -327,51 +339,61 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-
 ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		A[i] = _mm_setzero_ps();
+	}
 }
+
 ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MAT(A, n, row, col) = _mm_setzero_ps();
+		}
+	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+		}
+	}
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		V[i] = _mm_add_ps(V[i], a[i]);
+	}
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		V[i] = _mm_mul_ps(V[i], a[i]);
+	}
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n)
+ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
 {
-	for(int i = 0; i < n; i++)
+	for(int i = 0; i < n; i++) {
 		a[i] = _mm_max_ps(a[i], b[i]);
+	}
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
 {
-	for(int row = 0; row < n; row++)
-		for(int col = 0; col <= row; col++)
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
 			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+		}
+	}
 }
 #endif
 
diff --git a/intern/string/STR_HashedString.h b/intern/string/STR_HashedString.h
index 8bfbde65895..ce790f398a0 100644
--- a/intern/string/STR_HashedString.h
+++ b/intern/string/STR_HashedString.h
@@ -38,6 +38,14 @@
 
 #include "STR_String.h"
 
+/* copied from 'BLI_compiler_attrs.h' */
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
+#define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#else
+#define ATTR_FALLTHROUGH ((void)0)
+#endif
+
 
 // Hash Mix utility function, by Bob Jenkins - Mix 3 32-bit values reversibly
 //
@@ -102,16 +110,16 @@ static dword STR_gHash(const void *in, int len, dword init_val)
 	// Handle the last 11 bytes
 	c += len;
 	switch (length) {
-		case 11: c += ((dword)p_in[10] << 24);
-		case 10: c += ((dword)p_in[9]  << 16);
-		case  9: c += ((dword)p_in[8]  << 8);  /* the first byte of c is reserved for the length */
-		case  8: b += ((dword)p_in[7]  << 24);
-		case  7: b += ((dword)p_in[6]  << 16);
-		case  6: b += ((dword)p_in[5]  << 8);
-		case  5: b += p_in[4];
-		case  4: a += ((dword)p_in[3]  << 24);
-		case  3: a += ((dword)p_in[2]  << 16);
-		case  2: a += ((dword)p_in[1]  << 8);
+		case 11: c += ((dword)p_in[10] << 24); ATTR_FALLTHROUGH;
+		case 10: c += ((dword)p_in[9]  << 16); ATTR_FALLTHROUGH;
+		case  9: c += ((dword)p_in[8]  << 8);  ATTR_FALLTHROUGH; /* the first byte of c is reserved for the length */
+		case  8: b += ((dword)p_in[7]  << 24); ATTR_FALLTHROUGH;
+		case  7: b += ((dword)p_in[6]  << 16); ATTR_FALLTHROUGH;
+		case  6: b += ((dword)p_in[5]  << 8);  ATTR_FALLTHROUGH;
+		case  5: b += p_in[4];                 ATTR_FALLTHROUGH;
+		case  4: a += ((dword)p_in[3]  << 24); ATTR_FALLTHROUGH;
+		case  3: a += ((dword)p_in[2]  << 16); ATTR_FALLTHROUGH;
+		case  2: a += ((dword)p_in[1]  << 8);  ATTR_FALLTHROUGH;
 		case  1: a += p_in[0];
 	}
 	STR_gHashMix(a, b, c);
author	Campbell Barton <ideasman42@gmail.com>	2017-05-20 07:19:05 +0300
committer	Campbell Barton <ideasman42@gmail.com>	2017-05-20 07:19:05 +0300
commit	65aab6cdae822822122f5e180baf10e39f600bf2 (patch)
tree	fc8556ec5590a0e47c41e2c18131fede4b67732b /intern
parent	996bf65730257d0a80c6ada03a38d0a321b1e87e (diff)
parent	81e584ed17902878579131776b4e5a9f7b54cdab (diff)