ClangFormat: apply to source, most of intern

Apply clang format as proposed in T53211. For details on usage and instructions for migrating branches without conflicts, see: https://wiki.blender.org/wiki/Tools/ClangFormat
author: Campbell Barton <ideasman42@gmail.com> 2019-04-17 07:17:24 +0300
committer: Campbell Barton <ideasman42@gmail.com> 2019-04-17 07:21:24 +0300
commit: e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch)
tree: 8cf3453d12edb177a218ef8009357518ec6cab6a /intern/cycles/kernel/filter
parent: b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff)
11 files changed, 1101 insertions, 998 deletions
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
index 4209d69ee73..b067e53a8bf 100644
--- a/intern/cycles/kernel/filter/filter.h
+++ b/intern/cycles/kernel/filter/filter.h
@@ -25,8 +25,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
-#define KERNEL_NAME_EVAL(arch, name)  KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
 #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
 
 #define KERNEL_ARCH cpu
@@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#endif  /* __FILTER_H__ */
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index cb04aac35f4..0e51eeef92f 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -18,59 +18,56 @@
 #define __FILTER_DEFINES_H__
 
 #define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
-#define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
-#define XTWY_SIZE      (DENOISE_FEATURES+1)
+#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
+#define XTWY_SIZE (DENOISE_FEATURES + 1)
 
 #define DENOISE_MAX_FRAMES 16
 
 typedef struct TileInfo {
-	int offsets[9];
-	int strides[9];
-	int x[4];
-	int y[4];
-	int from_render;
-	int frames[DENOISE_MAX_FRAMES];
-	int num_frames;
-	/* TODO(lukas): CUDA doesn't have uint64_t... */
+  int offsets[9];
+  int strides[9];
+  int x[4];
+  int y[4];
+  int from_render;
+  int frames[DENOISE_MAX_FRAMES];
+  int num_frames;
+  /* TODO(lukas): CUDA doesn't have uint64_t... */
 #ifdef __KERNEL_OPENCL__
-	ccl_global float *buffers[9];
+  ccl_global float *buffers[9];
 #else
-	long long int buffers[9];
+  long long int buffers[9];
 #endif
 } TileInfo;
 
 #ifdef __KERNEL_OPENCL__
-#  define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info,  \
-                               ccl_global float *tile_buffer_1, \
-                               ccl_global float *tile_buffer_2, \
-                               ccl_global float *tile_buffer_3, \
-                               ccl_global float *tile_buffer_4, \
-                               ccl_global float *tile_buffer_5, \
-                               ccl_global float *tile_buffer_6, \
-                               ccl_global float *tile_buffer_7, \
-                               ccl_global float *tile_buffer_8, \
-                               ccl_global float *tile_buffer_9
-#  define CCL_FILTER_TILE_INFO_ARG tile_info, \
-                                   tile_buffer_1, tile_buffer_2, tile_buffer_3, \
-                                   tile_buffer_4, tile_buffer_5, tile_buffer_6, \
-                                   tile_buffer_7, tile_buffer_8, tile_buffer_9
-#  define ccl_get_tile_buffer(id) (id == 0 ? tile_buffer_1 \
-                                   : id == 1 ? tile_buffer_2 \
-                                   : id == 2 ? tile_buffer_3 \
-                                   : id == 3 ? tile_buffer_4 \
-                                   : id == 4 ? tile_buffer_5 \
-                                   : id == 5 ? tile_buffer_6 \
-                                   : id == 6 ? tile_buffer_7 \
-                                   : id == 7 ? tile_buffer_8 \
-                                   : tile_buffer_9)
+#  define CCL_FILTER_TILE_INFO \
+    ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
+        ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
+        ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
+        ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
+        ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
+#  define CCL_FILTER_TILE_INFO_ARG \
+    tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
+        tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
+#  define ccl_get_tile_buffer(id) \
+    (id == 0 ? tile_buffer_1 : \
+               id == 1 ? \
+               tile_buffer_2 : \
+               id == 2 ? \
+               tile_buffer_3 : \
+               id == 3 ? tile_buffer_4 : \
+                         id == 4 ? tile_buffer_5 : \
+                                   id == 5 ? tile_buffer_6 : \
+                                             id == 6 ? tile_buffer_7 : \
+                                                       id == 7 ? tile_buffer_8 : tile_buffer_9)
 #else
 #  ifdef __KERNEL_CUDA__
-#    define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info
+#    define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
 #  else
-#    define CCL_FILTER_TILE_INFO TileInfo* tile_info
+#    define CCL_FILTER_TILE_INFO TileInfo *tile_info
 #  endif
 #  define ccl_get_tile_buffer(id) (tile_info->buffers[id])
 #endif
 
-#endif  /* __FILTER_DEFINES_H__*/
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index e1ea6487aa9..809ccfe8be6 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -14,22 +14,25 @@
  * limitations under the License.
  */
 
- CCL_NAMESPACE_BEGIN
+CCL_NAMESPACE_BEGIN
 
 #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
  * Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
-                                 pixel.z = tile_info->frames[frame]; \
-                                 pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
-                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+#define FOR_PIXEL_WINDOW \
+  for (int frame = 0; frame < tile_info->num_frames; frame++) { \
+    pixel.z = tile_info->frames[frame]; \
+    pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
+                   frame * frame_stride; \
+    for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+      for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
 
-#define END_FOR_PIXEL_WINDOW         } \
-                                     pixel_buffer += buffer_w - (high.x - low.x); \
-                                 } \
-                             }
+#define END_FOR_PIXEL_WINDOW \
+  } \
+  pixel_buffer += buffer_w - (high.x - low.x); \
+  } \
+  }
 
 ccl_device_inline void filter_get_features(int3 pixel,
                                            const ccl_global float *ccl_restrict buffer,
@@ -38,24 +41,24 @@ ccl_device_inline void filter_get_features(int3 pixel,
                                            const float *ccl_restrict mean,
                                            int pass_stride)
 {
-	features[0] = pixel.x;
-	features[1] = pixel.y;
-	features[2] = fabsf(ccl_get_feature(buffer, 0));
-	features[3] = ccl_get_feature(buffer, 1);
-	features[4] = ccl_get_feature(buffer, 2);
-	features[5] = ccl_get_feature(buffer, 3);
-	features[6] = ccl_get_feature(buffer, 4);
-	features[7] = ccl_get_feature(buffer, 5);
-	features[8] = ccl_get_feature(buffer, 6);
-	features[9] = ccl_get_feature(buffer, 7);
-	if(use_time) {
-		features[10] = pixel.z;
-	}
-	if(mean) {
-		for(int i = 0; i < (use_time? 11 : 10); i++) {
-			features[i] -= mean[i];
-		}
-	}
+  features[0] = pixel.x;
+  features[1] = pixel.y;
+  features[2] = fabsf(ccl_get_feature(buffer, 0));
+  features[3] = ccl_get_feature(buffer, 1);
+  features[4] = ccl_get_feature(buffer, 2);
+  features[5] = ccl_get_feature(buffer, 3);
+  features[6] = ccl_get_feature(buffer, 4);
+  features[7] = ccl_get_feature(buffer, 5);
+  features[8] = ccl_get_feature(buffer, 6);
+  features[9] = ccl_get_feature(buffer, 7);
+  if (use_time) {
+    features[10] = pixel.z;
+  }
+  if (mean) {
+    for (int i = 0; i < (use_time ? 11 : 10); i++) {
+      features[i] -= mean[i];
+    }
+  }
 }
 
 ccl_device_inline void filter_get_feature_scales(int3 pixel,
@@ -65,38 +68,39 @@ ccl_device_inline void filter_get_feature_scales(int3 pixel,
                                                  const float *ccl_restrict mean,
                                                  int pass_stride)
 {
-	scales[0] = fabsf(pixel.x - mean[0]);
-	scales[1] = fabsf(pixel.y - mean[1]);
-	scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
-	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
-	                                    ccl_get_feature(buffer, 2) - mean[4],
-	                                    ccl_get_feature(buffer, 3) - mean[5]));
-	scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
-	scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
-	                                    ccl_get_feature(buffer, 6) - mean[8],
-	                                    ccl_get_feature(buffer, 7) - mean[9]));
-	if(use_time) {
-		scales[6] = fabsf(pixel.z - mean[10]);
-	}
+  scales[0] = fabsf(pixel.x - mean[0]);
+  scales[1] = fabsf(pixel.y - mean[1]);
+  scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+  scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+                                      ccl_get_feature(buffer, 2) - mean[4],
+                                      ccl_get_feature(buffer, 3) - mean[5]));
+  scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+  scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+                                      ccl_get_feature(buffer, 6) - mean[8],
+                                      ccl_get_feature(buffer, 7) - mean[9]));
+  if (use_time) {
+    scales[6] = fabsf(pixel.z - mean[10]);
+  }
 }
 
 ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
 {
-	scale[0] = 1.0f/max(scale[0], 0.01f);
-	scale[1] = 1.0f/max(scale[1], 0.01f);
-	scale[2] = 1.0f/max(scale[2], 0.01f);
-	if(use_time) {
-		scale[10] = 1.0f/max(scale[6], 0.01f);
-	}
-	scale[6] = 1.0f/max(scale[4], 0.01f);
-	scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
-	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+  scale[0] = 1.0f / max(scale[0], 0.01f);
+  scale[1] = 1.0f / max(scale[1], 0.01f);
+  scale[2] = 1.0f / max(scale[2], 0.01f);
+  if (use_time) {
+    scale[10] = 1.0f / max(scale[6], 0.01f);
+  }
+  scale[6] = 1.0f / max(scale[4], 0.01f);
+  scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
+  scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
 }
 
 ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
                                           int pass_stride)
 {
-	return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+  return make_float3(
+      ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
 }
 
 ccl_device_inline void design_row_add(float *design_row,
@@ -107,42 +111,44 @@ ccl_device_inline void design_row_add(float *design_row,
                                       float feature,
                                       int transform_row_stride)
 {
-	for(int i = 0; i < rank; i++) {
-		design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
-	}
+  for (int i = 0; i < rank; i++) {
+    design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
+  }
 }
 
 /* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
-                                                       const ccl_global float *ccl_restrict p_buffer,
-                                                       int3 q_pixel,
-                                                       const ccl_global float *ccl_restrict q_buffer,
-                                                       int pass_stride,
-                                                       int rank,
-                                                       float *design_row,
-                                                       const ccl_global float *ccl_restrict transform,
-                                                       int stride,
-                                                       bool use_time)
+ccl_device_inline void filter_get_design_row_transform(
+    int3 p_pixel,
+    const ccl_global float *ccl_restrict p_buffer,
+    int3 q_pixel,
+    const ccl_global float *ccl_restrict q_buffer,
+    int pass_stride,
+    int rank,
+    float *design_row,
+    const ccl_global float *ccl_restrict transform,
+    int stride,
+    bool use_time)
 {
-	int num_features = use_time? 11 : 10;
+  int num_features = use_time ? 11 : 10;
 
-	design_row[0] = 1.0f;
-	math_vector_zero(design_row+1, rank);
+  design_row[0] = 1.0f;
+  math_vector_zero(design_row + 1, rank);
 
-#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
-	DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
-	DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
-	DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
-	DESIGN_ROW_ADD(3,       ccl_get_feature(q_buffer, 1)  -       ccl_get_feature(p_buffer, 1));
-	DESIGN_ROW_ADD(4,       ccl_get_feature(q_buffer, 2)  -       ccl_get_feature(p_buffer, 2));
-	DESIGN_ROW_ADD(5,       ccl_get_feature(q_buffer, 3)  -       ccl_get_feature(p_buffer, 3));
-	DESIGN_ROW_ADD(6,       ccl_get_feature(q_buffer, 4)  -       ccl_get_feature(p_buffer, 4));
-	DESIGN_ROW_ADD(7,       ccl_get_feature(q_buffer, 5)  -       ccl_get_feature(p_buffer, 5));
-	DESIGN_ROW_ADD(8,       ccl_get_feature(q_buffer, 6)  -       ccl_get_feature(p_buffer, 6));
-	DESIGN_ROW_ADD(9,       ccl_get_feature(q_buffer, 7)  -       ccl_get_feature(p_buffer, 7));
-	if(use_time) {
-		DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
-	}
+#define DESIGN_ROW_ADD(I, F) \
+  design_row_add(design_row, rank, transform, stride, I, F, num_features);
+  DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+  DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+  DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+  DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+  DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+  DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+  DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+  DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+  DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+  DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+  if (use_time) {
+    DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+  }
 #undef DESIGN_ROW_ADD
 }
 
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 5dd001ffb93..1e0d6e93453 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -22,22 +22,27 @@ CCL_NAMESPACE_BEGIN
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
  * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
  * Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
-                                     pixel.z = tile_info->frames[frame]; \
-                                     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
-                                     float4 t4 = make_float4(pixel.z); \
-                                     for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                         float4 y4 = make_float4(pixel.y); \
-                                         for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                             float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
-                                             int4 active_pixels = x4 < make_float4(high.x);
+#define FOR_PIXEL_WINDOW_SSE \
+  for (int frame = 0; frame < tile_info->num_frames; frame++) { \
+    pixel.z = tile_info->frames[frame]; \
+    pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
+                   frame * frame_stride; \
+    float4 t4 = make_float4(pixel.z); \
+    for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+      float4 y4 = make_float4(pixel.y); \
+      for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+        float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+        int4 active_pixels = x4 < make_float4(high.x);
 
-#define END_FOR_PIXEL_WINDOW_SSE         } \
-                                         pixel_buffer += buffer_w - (high.x - low.x); \
-                                     } \
-                                 }
+#define END_FOR_PIXEL_WINDOW_SSE \
+  } \
+  pixel_buffer += buffer_w - (high.x - low.x); \
+  } \
+  }
 
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
+ccl_device_inline void filter_get_features_sse(float4 x,
+                                               float4 y,
+                                               float4 t,
                                                int4 active_pixels,
                                                const float *ccl_restrict buffer,
                                                float4 *features,
@@ -45,33 +50,35 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
                                                const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
-	int num_features = use_time? 11 : 10;
+  int num_features = use_time ? 11 : 10;
 
-	features[0] = x;
-	features[1] = y;
-	features[2] = fabs(ccl_get_feature_sse(0));
-	features[3] = ccl_get_feature_sse(1);
-	features[4] = ccl_get_feature_sse(2);
-	features[5] = ccl_get_feature_sse(3);
-	features[6] = ccl_get_feature_sse(4);
-	features[7] = ccl_get_feature_sse(5);
-	features[8] = ccl_get_feature_sse(6);
-	features[9] = ccl_get_feature_sse(7);
-	if(use_time) {
-		features[10] = t;
-	}
+  features[0] = x;
+  features[1] = y;
+  features[2] = fabs(ccl_get_feature_sse(0));
+  features[3] = ccl_get_feature_sse(1);
+  features[4] = ccl_get_feature_sse(2);
+  features[5] = ccl_get_feature_sse(3);
+  features[6] = ccl_get_feature_sse(4);
+  features[7] = ccl_get_feature_sse(5);
+  features[8] = ccl_get_feature_sse(6);
+  features[9] = ccl_get_feature_sse(7);
+  if (use_time) {
+    features[10] = t;
+  }
 
-	if(mean) {
-		for(int i = 0; i < num_features; i++) {
-			features[i] = features[i] - mean[i];
-		}
-	}
-	for(int i = 0; i < num_features; i++) {
-		features[i] = mask(active_pixels, features[i]);
-	}
+  if (mean) {
+    for (int i = 0; i < num_features; i++) {
+      features[i] = features[i] - mean[i];
+    }
+  }
+  for (int i = 0; i < num_features; i++) {
+    features[i] = mask(active_pixels, features[i]);
+  }
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x,
+                                                     float4 y,
+                                                     float4 t,
                                                      int4 active_pixels,
                                                      const float *ccl_restrict buffer,
                                                      float4 *scales,
@@ -79,36 +86,34 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4
                                                      const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
-	scales[0] = fabs(x - mean[0]);
-	scales[1] = fabs(y - mean[1]);
-	scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
-	scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
-	            sqr(ccl_get_feature_sse(2) - mean[4]) +
-	            sqr(ccl_get_feature_sse(3) - mean[5]);
-	scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
-	scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
-	            sqr(ccl_get_feature_sse(6) - mean[8]) +
-	            sqr(ccl_get_feature_sse(7) - mean[9]);
-	if(use_time) {
-		scales[6] = fabs(t - mean[10]);
-	}
+  scales[0] = fabs(x - mean[0]);
+  scales[1] = fabs(y - mean[1]);
+  scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+  scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
+              sqr(ccl_get_feature_sse(3) - mean[5]);
+  scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+  scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
+              sqr(ccl_get_feature_sse(7) - mean[9]);
+  if (use_time) {
+    scales[6] = fabs(t - mean[10]);
+  }
 
-	for(int i = 0; i < (use_time? 7 : 6); i++)
-		scales[i] = mask(active_pixels, scales[i]);
+  for (int i = 0; i < (use_time ? 7 : 6); i++)
+    scales[i] = mask(active_pixels, scales[i]);
 }
 
 ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
 {
-	scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
-	scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
-	scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
-	if(use_time) {
-		scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
-	}
-	scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
-	scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
-	scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
+  scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+  scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+  scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+  if (use_time) {
+    scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
+    ;
+  }
+  scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+  scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+  scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
 }
 
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 9eb3c603a4a..a94266a8786 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,10 +16,11 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf) + (ofs))
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
+                                                         int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
                                                          const float *ccl_restrict scale_image,
@@ -31,122 +32,117 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
-	/* Strides need to be aligned to 16 bytes. */
-	kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
-	int aligned_lowx = rect.x & (~3);
-	const int numChannels = (channel_offset > 0)? 3 : 1;
-	const float4 channel_fac = make_float4(1.0f / numChannels);
-
-	for(int y = rect.y; y < rect.w; y++) {
-		int idx_p = y*stride + aligned_lowx;
-		int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
-		for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
-			float4 diff = make_float4(0.0f);
-			float4 scale_fac;
-			if(scale_image) {
-				scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
-				                  make_float4(0.25f), make_float4(4.0f));
-			}
-			else {
-				scale_fac = make_float4(1.0f);
-			}
-			for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
-				/* idx_p is guaranteed to be aligned, but idx_q isn't. */
-				float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
-				float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
-				float4 cdiff = color_p - color_q;
-				float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
-				float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
-				diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
-			}
-			load4_a(difference_image, idx_p) = diff*channel_fac;
-		}
-	}
+  /* Strides need to be aligned to 16 bytes. */
+  kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+  int aligned_lowx = rect.x & (~3);
+  const int numChannels = (channel_offset > 0) ? 3 : 1;
+  const float4 channel_fac = make_float4(1.0f / numChannels);
+
+  for (int y = rect.y; y < rect.w; y++) {
+    int idx_p = y * stride + aligned_lowx;
+    int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
+    for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+      float4 diff = make_float4(0.0f);
+      float4 scale_fac;
+      if (scale_image) {
+        scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+                          make_float4(0.25f),
+                          make_float4(4.0f));
+      }
+      else {
+        scale_fac = make_float4(1.0f);
+      }
+      for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+        /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+        float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+        float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
+        float4 cdiff = color_p - color_q;
+        float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+        float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
+        diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
+                (make_float4(1e-8f) + k_2 * (var_p + var_q));
+      }
+      load4_a(difference_image, idx_p) = diff * channel_fac;
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
-                                              float *out_image,
-                                              int4 rect,
-                                              int stride,
-                                              int f)
+ccl_device_inline void kernel_filter_nlm_blur(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		const int low = max(rect.y, y-f);
-		const int high = min(rect.w, y+f+1);
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = make_float4(0.0f);
-		}
-		for(int y1 = low; y1 < high; y1++) {
-			for(int x = aligned_lowx; x < rect.z; x += 4) {
-				load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
-			}
-		}
-		float fac = 1.0f/(high - low);
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) *= fac;
-		}
-	}
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    const int low = max(rect.y, y - f);
+    const int high = min(rect.w, y + f + 1);
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = make_float4(0.0f);
+    }
+    for (int y1 = low; y1 < high; y1++) {
+      for (int x = aligned_lowx; x < rect.z; x += 4) {
+        load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
+      }
+    }
+    float fac = 1.0f / (high - low);
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) *= fac;
+    }
+  }
 }
 
-ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
-                                           float *out_image,
-                                           int4 rect,
-                                           int stride,
-                                           int f)
+ccl_device_inline void nlm_blur_horizontal(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = make_float4(0.0f);
-		}
-	}
-
-	for(int dx = -f; dx <= f; dx++) {
-		aligned_lowx = round_down(rect.x - min(0, dx), 4);
-		int highx = rect.z - max(0, dx);
-		int4 lowx4 = make_int4(rect.x - min(0, dx));
-		int4 highx4 = make_int4(rect.z - max(0, dx));
-		for(int y = rect.y; y < rect.w; y++) {
-			for(int x = aligned_lowx; x < highx; x += 4) {
-				int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
-				int4 active = (x4 >= lowx4) & (x4 < highx4);
-
-				float4 diff = load4_u(difference_image, y*stride + x + dx);
-				load4_a(out_image, y*stride + x) += mask(active, diff);
-			}
-		}
-	}
-
-	aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
-			float4 low = max(make_float4(rect.x), x4 - make_float4(f));
-			float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
-			load4_a(out_image, y*stride + x) *= rcp(high - low);
-		}
-	}
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = make_float4(0.0f);
+    }
+  }
+
+  for (int dx = -f; dx <= f; dx++) {
+    aligned_lowx = round_down(rect.x - min(0, dx), 4);
+    int highx = rect.z - max(0, dx);
+    int4 lowx4 = make_int4(rect.x - min(0, dx));
+    int4 highx4 = make_int4(rect.z - max(0, dx));
+    for (int y = rect.y; y < rect.w; y++) {
+      for (int x = aligned_lowx; x < highx; x += 4) {
+        int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+        int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+        float4 diff = load4_u(difference_image, y * stride + x + dx);
+        load4_a(out_image, y * stride + x) += mask(active, diff);
+      }
+    }
+  }
+
+  aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+      float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+      float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
+      load4_a(out_image, y * stride + x) *= rcp(high - low);
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                                     float *out_image,
-                                                     int4 rect,
-                                                     int stride,
-                                                     int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
-		}
-	}
+  nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = fast_expf4(
+          -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_update_output(int dx,
+                                                       int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
                                                        float *temp_image,
@@ -157,33 +153,36 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        int stride,
                                                        int f)
 {
-	nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+  nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
 
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
-			int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+      int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
 
-			int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+      int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
 
-			float4 weight = load4_a(temp_image, idx_p);
-			load4_a(accum_image, idx_p) += mask(active, weight);
+      float4 weight = load4_a(temp_image, idx_p);
+      load4_a(accum_image, idx_p) += mask(active, weight);
 
-			float4 val = load4_u(image, idx_q);
-			if(channel_offset) {
-				val += load4_u(image, idx_q + channel_offset);
-				val += load4_u(image, idx_q + 2*channel_offset);
-				val *= 1.0f/3.0f;
-			}
+      float4 val = load4_u(image, idx_q);
+      if (channel_offset) {
+        val += load4_u(image, idx_q + channel_offset);
+        val += load4_u(image, idx_q + 2 * channel_offset);
+        val *= 1.0f / 3.0f;
+      }
 
-			load4_a(out_image, idx_p) += mask(active, weight*val);
-		}
-	}
+      load4_a(out_image, idx_p) += mask(active, weight * val);
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
-                                                           const float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
+                                                           int dy,
+                                                           int t,
+                                                           const float *ccl_restrict
+                                                               difference_image,
                                                            const float *ccl_restrict buffer,
                                                            float *transform,
                                                            int *rank,
@@ -191,40 +190,49 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t
                                                            float3 *XtWY,
                                                            int4 rect,
                                                            int4 filter_window,
-                                                           int stride, int f,
+                                                           int stride,
+                                                           int f,
                                                            int pass_stride,
                                                            int frame_offset,
                                                            bool use_time)
 {
-	int4 clip_area = rect_clip(rect, filter_window);
-	/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
-	for(int y = clip_area.y; y < clip_area.w; y++) {
-		for(int x = clip_area.x; x < clip_area.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			float sum = 0.0f;
-			for(int x1 = low; x1 < high; x1++) {
-				sum += difference_image[y*stride + x1];
-			}
-			float weight = sum * (1.0f/(high - low));
-
-			int storage_ofs = coord_to_local_index(filter_window, x, y);
-			float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
-			float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
-			float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
-			int    *l_rank = rank + storage_ofs;
-
-			kernel_filter_construct_gramian(x, y, 1,
-			                                dx, dy, t,
-			                                stride,
-			                                pass_stride,
-			                                frame_offset,
-			                                use_time,
-			                                buffer,
-			                                l_transform, l_rank,
-			                                weight, l_XtWX, l_XtWY, 0);
-		}
-	}
+  int4 clip_area = rect_clip(rect, filter_window);
+  /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+  for (int y = clip_area.y; y < clip_area.w; y++) {
+    for (int x = clip_area.x; x < clip_area.z; x++) {
+      const int low = max(rect.x, x - f);
+      const int high = min(rect.z, x + f + 1);
+      float sum = 0.0f;
+      for (int x1 = low; x1 < high; x1++) {
+        sum += difference_image[y * stride + x1];
+      }
+      float weight = sum * (1.0f / (high - low));
+
+      int storage_ofs = coord_to_local_index(filter_window, x, y);
+      float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
+      float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
+      float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
+      int *l_rank = rank + storage_ofs;
+
+      kernel_filter_construct_gramian(x,
+                                      y,
+                                      1,
+                                      dx,
+                                      dy,
+                                      t,
+                                      stride,
+                                      pass_stride,
+                                      frame_offset,
+                                      use_time,
+                                      buffer,
+                                      l_transform,
+                                      l_rank,
+                                      weight,
+                                      l_XtWX,
+                                      l_XtWY,
+                                      0);
+    }
+  }
 }
 
 ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
@@ -232,11 +240,11 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
                                                    int4 rect,
                                                    int w)
 {
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*w+x] /= accum_image[y*w+x];
-		}
-	}
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = rect.x; x < rect.z; x++) {
+      out_image[y * w + x] /= accum_image[y * w + x];
+    }
+  }
 }
 
 #undef load4_a
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 12636393243..650c743f34f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN
  * Window is the rect that should be processed.
  * co is filled with (x, y, dx, dy).
  */
-ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
-                                             int4 *rect, int4 *co, int *ofs,
-                                             int4 window)
+ccl_device_inline bool get_nlm_coords_window(
+    int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
 {
-	/* Determine the pixel offset that this thread should apply. */
-	int s = 2*r+1;
-	int si = ccl_global_id(1);
-	int sx = si % s;
-	int sy = si / s;
-	if(sy >= s) {
-		return false;
-	}
-
-	/* Pixels still need to lie inside the denoising buffer after applying the offset,
-	 * so determine the area for which this is the case. */
-	int dx = sx - r;
-	int dy = sy - r;
-
-	*rect = make_int4(max(0, -dx),     max(0, -dy),
-	              w - max(0,  dx), h - max(0,  dy));
-
-	/* Find the intersection of the area that we want to process (window) and the area
-	 * that can be processed (rect) to get the final area for this offset. */
-	int4 clip_area = rect_clip(window, *rect);
-
-	/* If the radius is larger than one of the sides of the window,
-	 * there will be shifts for which there is no usable pixel at all. */
-	if(!rect_is_valid(clip_area)) {
-		return false;
-	}
-
-	/* Map the linear thread index to pixels inside the clip area. */
-	int x, y;
-	if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
-		return false;
-	}
-
-	*co = make_int4(x, y, dx, dy);
-
-	*ofs = (sy*s + sx) * stride;
-
-	return true;
+  /* Determine the pixel offset that this thread should apply. */
+  int s = 2 * r + 1;
+  int si = ccl_global_id(1);
+  int sx = si % s;
+  int sy = si / s;
+  if (sy >= s) {
+    return false;
+  }
+
+  /* Pixels still need to lie inside the denoising buffer after applying the offset,
+   * so determine the area for which this is the case. */
+  int dx = sx - r;
+  int dy = sy - r;
+
+  *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
+
+  /* Find the intersection of the area that we want to process (window) and the area
+   * that can be processed (rect) to get the final area for this offset. */
+  int4 clip_area = rect_clip(window, *rect);
+
+  /* If the radius is larger than one of the sides of the window,
+   * there will be shifts for which there is no usable pixel at all. */
+  if (!rect_is_valid(clip_area)) {
+    return false;
+  }
+
+  /* Map the linear thread index to pixels inside the clip area. */
+  int x, y;
+  if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+    return false;
+  }
+
+  *co = make_int4(x, y, dx, dy);
+
+  *ofs = (sy * s + sx) * stride;
+
+  return true;
 }
 
-ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
-                                      int4 *rect, int4 *co, int *ofs)
+ccl_device_inline bool get_nlm_coords(
+    int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
 {
-	return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+  return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
-                                                         int dx, int dy,
-                                                         const ccl_global float *ccl_restrict weight_image,
-                                                         const ccl_global float *ccl_restrict variance_image,
-                                                         const ccl_global float *ccl_restrict scale_image,
-                                                         ccl_global float *difference_image,
-                                                         int4 rect, int stride,
-                                                         int channel_offset,
-                                                         int frame_offset,
-                                                         float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(
+    int x,
+    int y,
+    int dx,
+    int dy,
+    const ccl_global float *ccl_restrict weight_image,
+    const ccl_global float *ccl_restrict variance_image,
+    const ccl_global float *ccl_restrict scale_image,
+    ccl_global float *difference_image,
+    int4 rect,
+    int stride,
+    int channel_offset,
+    int frame_offset,
+    float a,
+    float k_2)
 {
-	int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
-	int numChannels = channel_offset? 3 : 1;
-
-	float diff = 0.0f;
-	float scale_fac = 1.0f;
-	if(scale_image) {
-		scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
-	}
-
-	for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
-		float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
-		float pvar = variance_image[idx_p];
-		float qvar = sqr(scale_fac)*variance_image[idx_q];
-		diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
-	}
-	if(numChannels > 1) {
-		diff *= 1.0f/numChannels;
-	}
-	difference_image[y*stride + x] = diff;
+  int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
+  int numChannels = channel_offset ? 3 : 1;
+
+  float diff = 0.0f;
+  float scale_fac = 1.0f;
+  if (scale_image) {
+    scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+  }
+
+  for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+    float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
+    float pvar = variance_image[idx_p];
+    float qvar = sqr(scale_fac) * variance_image[idx_q];
+    diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
+  }
+  if (numChannels > 1) {
+    diff *= 1.0f / numChannels;
+  }
+  difference_image[y * stride + x] = diff;
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
-                                              const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_blur(int x,
+                                              int y,
+                                              const ccl_global float *ccl_restrict
+                                                  difference_image,
                                               ccl_global float *out_image,
-                                              int4 rect, int stride, int f)
+                                              int4 rect,
+                                              int stride,
+                                              int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.y, y-f);
-	const int high = min(rect.w, y+f+1);
-	for(int y1 = low; y1 < high; y1++) {
-		sum += difference_image[y1*stride + x];
-	}
-	sum *= 1.0f/(high-low);
-	out_image[y*stride + x] = sum;
+  float sum = 0.0f;
+  const int low = max(rect.y, y - f);
+  const int high = min(rect.w, y + f + 1);
+  for (int y1 = low; y1 < high; y1++) {
+    sum += difference_image[y1 * stride + x];
+  }
+  sum *= 1.0f / (high - low);
+  out_image[y * stride + x] = sum;
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
-                                                     const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
+                                                     int y,
+                                                     const ccl_global float *ccl_restrict
+                                                         difference_image,
                                                      ccl_global float *out_image,
-                                                     int4 rect, int stride, int f)
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	sum *= 1.0f/(high-low);
-	out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
+  float sum = 0.0f;
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  sum *= 1.0f / (high - low);
+  out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
 }
 
-ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
-                                                       int dx, int dy,
-                                                       const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_update_output(int x,
+                                                       int y,
+                                                       int dx,
+                                                       int dy,
+                                                       const ccl_global float *ccl_restrict
+                                                           difference_image,
                                                        const ccl_global float *ccl_restrict image,
                                                        ccl_global float *out_image,
                                                        ccl_global float *accum_image,
-                                                       int4 rect, int channel_offset,
-                                                       int stride, int f)
+                                                       int4 rect,
+                                                       int channel_offset,
+                                                       int stride,
+                                                       int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	sum *= 1.0f/(high-low);
-
-	int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
-	if(out_image) {
-		atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
-		float val = image[idx_q];
-		if(channel_offset) {
-			val += image[idx_q + channel_offset];
-			val += image[idx_q + 2*channel_offset];
-			val *= 1.0f/3.0f;
-		}
-		atomic_add_and_fetch_float(out_image + idx_p, sum*val);
-	}
-	else {
-		accum_image[idx_p] = sum;
-	}
+  float sum = 0.0f;
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  sum *= 1.0f / (high - low);
+
+  int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
+  if (out_image) {
+    atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+    float val = image[idx_q];
+    if (channel_offset) {
+      val += image[idx_q + channel_offset];
+      val += image[idx_q + 2 * channel_offset];
+      val *= 1.0f / 3.0f;
+    }
+    atomic_add_and_fetch_float(out_image + idx_p, sum * val);
+  }
+  else {
+    accum_image[idx_p] = sum;
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
-                                                           int dx, int dy, int t,
-                                                           const ccl_global float *ccl_restrict difference_image,
-                                                           const ccl_global float *ccl_restrict buffer,
-                                                           const ccl_global float *ccl_restrict transform,
-                                                           ccl_global int *rank,
-                                                           ccl_global float *XtWX,
-                                                           ccl_global float3 *XtWY,
-                                                           int4 rect,
-                                                           int4 filter_window,
-                                                           int stride, int f,
-                                                           int pass_stride,
-                                                           int frame_offset,
-                                                           bool use_time,
-                                                           int localIdx)
+ccl_device_inline void kernel_filter_nlm_construct_gramian(
+    int x,
+    int y,
+    int dx,
+    int dy,
+    int t,
+    const ccl_global float *ccl_restrict difference_image,
+    const ccl_global float *ccl_restrict buffer,
+    const ccl_global float *ccl_restrict transform,
+    ccl_global int *rank,
+    ccl_global float *XtWX,
+    ccl_global float3 *XtWY,
+    int4 rect,
+    int4 filter_window,
+    int stride,
+    int f,
+    int pass_stride,
+    int frame_offset,
+    bool use_time,
+    int localIdx)
 {
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	float sum = 0.0f;
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	float weight = sum * (1.0f/(high - low));
-
-	/* Reconstruction data is only stored for pixels inside the filter window,
-	 * so compute the pixels's index in there. */
-	int storage_ofs = coord_to_local_index(filter_window, x, y);
-	transform += storage_ofs;
-	rank += storage_ofs;
-	XtWX += storage_ofs;
-	XtWY += storage_ofs;
-
-	kernel_filter_construct_gramian(x, y,
-	                                rect_size(filter_window),
-	                                dx, dy, t,
-	                                stride,
-	                                pass_stride,
-	                                frame_offset,
-	                                use_time,
-	                                buffer,
-	                                transform, rank,
-	                                weight, XtWX, XtWY,
-	                                localIdx);
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  float sum = 0.0f;
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  float weight = sum * (1.0f / (high - low));
+
+  /* Reconstruction data is only stored for pixels inside the filter window,
+   * so compute the pixels's index in there. */
+  int storage_ofs = coord_to_local_index(filter_window, x, y);
+  transform += storage_ofs;
+  rank += storage_ofs;
+  XtWX += storage_ofs;
+  XtWY += storage_ofs;
+
+  kernel_filter_construct_gramian(x,
+                                  y,
+                                  rect_size(filter_window),
+                                  dx,
+                                  dy,
+                                  t,
+                                  stride,
+                                  pass_stride,
+                                  frame_offset,
+                                  use_time,
+                                  buffer,
+                                  transform,
+                                  rank,
+                                  weight,
+                                  XtWX,
+                                  XtWY,
+                                  localIdx);
 }
 
-ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ccl_device_inline void kernel_filter_nlm_normalize(int x,
+                                                   int y,
                                                    ccl_global float *out_image,
-                                                   const ccl_global float *ccl_restrict accum_image,
+                                                   const ccl_global float *ccl_restrict
+                                                       accum_image,
                                                    int stride)
 {
-	out_image[y*stride + x] /= accum_image[y*stride + x];
+  out_image[y * stride + x] /= accum_image[y * stride + x];
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index e24f4feb28d..8211311313d 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -27,7 +27,8 @@ CCL_NAMESPACE_BEGIN
  */
 ccl_device void kernel_filter_divide_shadow(int sample,
                                             CCL_FILTER_TILE_INFO,
-                                            int x, int y,
+                                            int x,
+                                            int y,
                                             ccl_global float *unfilteredA,
                                             ccl_global float *unfilteredB,
                                             ccl_global float *sampleVariance,
@@ -37,37 +38,39 @@ ccl_device void kernel_filter_divide_shadow(int sample,
                                             int buffer_pass_stride,
                                             int buffer_denoising_offset)
 {
-	int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
-	int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
-	int tile = ytile*3+xtile;
+  int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
+  int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
+  int tile = ytile * 3 + xtile;
 
-	int offset = tile_info->offsets[tile];
-	int stride = tile_info->strides[tile];
-	const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile);
-	center_buffer += (y*stride + x + offset)*buffer_pass_stride;
-	center_buffer += buffer_denoising_offset + 14;
+  int offset = tile_info->offsets[tile];
+  int stride = tile_info->strides[tile];
+  const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
+      tile);
+  center_buffer += (y * stride + x + offset) * buffer_pass_stride;
+  center_buffer += buffer_denoising_offset + 14;
 
-	int buffer_w = align_up(rect.z - rect.x, 4);
-	int idx = (y-rect.y)*buffer_w + (x - rect.x);
-	unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
-	unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+  int buffer_w = align_up(rect.z - rect.x, 4);
+  int idx = (y - rect.y) * buffer_w + (x - rect.x);
+  unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+  unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
 
-	float varA = center_buffer[2];
-	float varB = center_buffer[5];
-	int odd_sample = (sample+1)/2;
-	int even_sample = sample/2;
+  float varA = center_buffer[2];
+  float varB = center_buffer[5];
+  int odd_sample = (sample + 1) / 2;
+  int even_sample = sample / 2;
 
-	/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
-	 * update does not work efficiently with atomics in the kernel. */
-	varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
-	varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+  /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+   * update does not work efficiently with atomics in the kernel. */
+  varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
+  varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
 
-	varA /= max(odd_sample - 1, 1);
-	varB /= max(even_sample - 1, 1);
+  varA /= max(odd_sample - 1, 1);
+  varB /= max(even_sample - 1, 1);
 
-	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
-	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
-	bufferVariance[idx]  = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+  sampleVariance[idx] = 0.5f * (varA + varB) / sample;
+  sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
+  bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
+                        (unfilteredA[idx] - unfilteredB[idx]);
 }
 
 /* Load a regular feature from the render buffers into the denoise buffer.
@@ -80,55 +83,65 @@ ccl_device void kernel_filter_divide_shadow(int sample,
  */
 ccl_device void kernel_filter_get_feature(int sample,
                                           CCL_FILTER_TILE_INFO,
-                                          int m_offset, int v_offset,
-                                          int x, int y,
+                                          int m_offset,
+                                          int v_offset,
+                                          int x,
+                                          int y,
                                           ccl_global float *mean,
                                           ccl_global float *variance,
                                           float scale,
-                                          int4 rect, int buffer_pass_stride,
+                                          int4 rect,
+                                          int buffer_pass_stride,
                                           int buffer_denoising_offset)
 {
-	int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
-	int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
-	int tile = ytile*3+xtile;
-	ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+  int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
+  int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
+  int tile = ytile * 3 + xtile;
+  ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
+                                    (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
+                                        buffer_pass_stride +
+                                    buffer_denoising_offset;
 
-	int buffer_w = align_up(rect.z - rect.x, 4);
-	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+  int buffer_w = align_up(rect.z - rect.x, 4);
+  int idx = (y - rect.y) * buffer_w + (x - rect.x);
 
-	float val = scale * center_buffer[m_offset];
-	mean[idx] = val;
+  float val = scale * center_buffer[m_offset];
+  mean[idx] = val;
 
-	if(v_offset >= 0) {
-		if(sample > 1) {
-			/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
-			 * update does not work efficiently with atomics in the kernel. */
-			variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1)));
-		}
-		else {
-			/* Can't compute variance with single sample, just set it very high. */
-			variance[idx] = 1e10f;
-		}
-	}
+  if (v_offset >= 0) {
+    if (sample > 1) {
+      /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+       * update does not work efficiently with atomics in the kernel. */
+      variance[idx] = max(
+          0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
+    }
+    else {
+      /* Can't compute variance with single sample, just set it very high. */
+      variance[idx] = 1e10f;
+    }
+  }
 }
 
 ccl_device void kernel_filter_write_feature(int sample,
-                                            int x, int y,
+                                            int x,
+                                            int y,
                                             int4 buffer_params,
                                             ccl_global float *from,
                                             ccl_global float *buffer,
                                             int out_offset,
                                             int4 rect)
 {
-	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+  ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
+                                                   buffer_params.z;
 
-	int buffer_w = align_up(rect.z - rect.x, 4);
-	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+  int buffer_w = align_up(rect.z - rect.x, 4);
+  int idx = (y - rect.y) * buffer_w + (x - rect.x);
 
-	combined_buffer[out_offset] = from[idx];
+  combined_buffer[out_offset] = from[idx];
 }
 
-ccl_device void kernel_filter_detect_outliers(int x, int y,
+ccl_device void kernel_filter_detect_outliers(int x,
+                                              int y,
                                               ccl_global float *image,
                                               ccl_global float *variance,
                                               ccl_global float *depth,
@@ -136,123 +149,131 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
                                               int4 rect,
                                               int pass_stride)
 {
-	int buffer_w = align_up(rect.z - rect.x, 4);
+  int buffer_w = align_up(rect.z - rect.x, 4);
 
-	int n = 0;
-	float values[25];
-	float pixel_variance, max_variance = 0.0f;
-	for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
-		for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
-			int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
-			float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
-			color = max(color, make_float3(0.0f, 0.0f, 0.0f));
-			float L = average(color);
+  int n = 0;
+  float values[25];
+  float pixel_variance, max_variance = 0.0f;
+  for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
+    for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
+      int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
+      float3 color = make_float3(
+          image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+      color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+      float L = average(color);
 
-			/* Find the position of L. */
-			int i;
-			for(i = 0; i < n; i++) {
-				if(values[i] > L) break;
-			}
-			/* Make space for L by shifting all following values to the right. */
-			for(int j = n; j > i; j--) {
-				values[j] = values[j-1];
-			}
-			/* Insert L. */
-			values[i] = L;
-			n++;
+      /* Find the position of L. */
+      int i;
+      for (i = 0; i < n; i++) {
+        if (values[i] > L)
+          break;
+      }
+      /* Make space for L by shifting all following values to the right. */
+      for (int j = n; j > i; j--) {
+        values[j] = values[j - 1];
+      }
+      /* Insert L. */
+      values[i] = L;
+      n++;
 
-			float3 pixel_var = make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]);
-			float var = average(pixel_var);
-			if((x1 == x) && (y1 == y)) {
-				pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f)? -1.0f : var;
-			}
-			else {
-				max_variance = max(max_variance, var);
-			}
-		}
-	}
+      float3 pixel_var = make_float3(
+          variance[idx], variance[idx + pass_stride], variance[idx + 2 * pass_stride]);
+      float var = average(pixel_var);
+      if ((x1 == x) && (y1 == y)) {
+        pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
+                                                                                            var;
+      }
+      else {
+        max_variance = max(max_variance, var);
+      }
+    }
+  }
 
-	max_variance += 1e-4f;
+  max_variance += 1e-4f;
 
-	int idx = (y-rect.y)*buffer_w + (x-rect.x);
-	float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
-	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
-	float L = average(color);
+  int idx = (y - rect.y) * buffer_w + (x - rect.x);
+  float3 color = make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+  color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+  float L = average(color);
 
-	float ref = 2.0f*values[(int)(n*0.75f)];
+  float ref = 2.0f * values[(int)(n * 0.75f)];
 
-	/* Slightly offset values to avoid false positives in (almost) black areas. */
-	max_variance += 1e-5f;
-	ref -= 1e-5f;
+  /* Slightly offset values to avoid false positives in (almost) black areas. */
+  max_variance += 1e-5f;
+  ref -= 1e-5f;
 
-	if(L > ref) {
-		/* The pixel appears to be an outlier.
-		 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
-		 * should actually be at the reference value:
-		 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
-		 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
-		 */
+  if (L > ref) {
+    /* The pixel appears to be an outlier.
+     * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+     * should actually be at the reference value:
+     * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+     * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+     */
 
-		if(pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
-			depth[idx] = -depth[idx];
-			color *= ref/L;
-			variance[idx] = variance[idx + pass_stride] = variance[idx + 2*pass_stride] = max_variance;
-		}
-		else {
-			float stddev = sqrtf(pixel_variance);
-			if(L - 3*stddev < ref) {
-				/* The pixel is an outlier, so negate the depth value to mark it as one.
-				* Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
-				depth[idx] = -depth[idx];
-				float fac = ref/L;
-				color *= fac;
-				variance[idx              ] *= fac*fac;
-				variance[idx + pass_stride] *= fac*fac;
-				variance[idx+2*pass_stride] *= fac*fac;
-			}
-		}
-	}
-	out[idx              ] = color.x;
-	out[idx + pass_stride] = color.y;
-	out[idx+2*pass_stride] = color.z;
+    if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
+      depth[idx] = -depth[idx];
+      color *= ref / L;
+      variance[idx] = variance[idx + pass_stride] = variance[idx + 2 * pass_stride] = max_variance;
+    }
+    else {
+      float stddev = sqrtf(pixel_variance);
+      if (L - 3 * stddev < ref) {
+        /* The pixel is an outlier, so negate the depth value to mark it as one.
+        * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+        depth[idx] = -depth[idx];
+        float fac = ref / L;
+        color *= fac;
+        variance[idx] *= fac * fac;
+        variance[idx + pass_stride] *= fac * fac;
+        variance[idx + 2 * pass_stride] *= fac * fac;
+      }
+    }
+  }
+  out[idx] = color.x;
+  out[idx + pass_stride] = color.y;
+  out[idx + 2 * pass_stride] = color.z;
 }
 
 /* Combine A/B buffers.
  * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x, int y,
+ccl_device void kernel_filter_combine_halves(int x,
+                                             int y,
                                              ccl_global float *mean,
                                              ccl_global float *variance,
                                              ccl_global float *a,
                                              ccl_global float *b,
-                                             int4 rect, int r)
+                                             int4 rect,
+                                             int r)
 {
-	int buffer_w = align_up(rect.z - rect.x, 4);
-	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+  int buffer_w = align_up(rect.z - rect.x, 4);
+  int idx = (y - rect.y) * buffer_w + (x - rect.x);
 
-	if(mean)     mean[idx] = 0.5f * (a[idx]+b[idx]);
-	if(variance) {
-		if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
-		else {
-			variance[idx] = 0.0f;
-			float values[25];
-			int numValues = 0;
-			for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
-				for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
-					int pidx = (py-rect.y)*buffer_w + (px-rect.x);
-					values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
-				}
-			}
-			/* Insertion-sort the variances (fast enough for 25 elements). */
-			for(int i = 1; i < numValues; i++) {
-				float v = values[i];
-				int j;
-				for(j = i-1; j >= 0 && values[j] > v; j--)
-					values[j+1] = values[j];
-				values[j+1] = v;
-			}
-			variance[idx] = values[(7*numValues)/8];
-		}
-	}
+  if (mean)
+    mean[idx] = 0.5f * (a[idx] + b[idx]);
+  if (variance) {
+    if (r == 0)
+      variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
+    else {
+      variance[idx] = 0.0f;
+      float values[25];
+      int numValues = 0;
+      for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
+        for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
+          int pidx = (py - rect.y) * buffer_w + (px - rect.x);
+          values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
+        }
+      }
+      /* Insertion-sort the variances (fast enough for 25 elements). */
+      for (int i = 1; i < numValues; i++) {
+        float v = values[i];
+        int j;
+        for (j = i - 1; j >= 0 && values[j] > v; j--)
+          values[j + 1] = values[j];
+        values[j + 1] = v;
+      }
+      variance[idx] = values[(7 * numValues) / 8];
+    }
+  }
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index ceda8f71f98..850f20584da 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -16,63 +16,75 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ccl_device_inline void kernel_filter_construct_gramian(int x,
+                                                       int y,
                                                        int storage_stride,
-                                                       int dx, int dy, int t,
+                                                       int dx,
+                                                       int dy,
+                                                       int t,
                                                        int buffer_stride,
                                                        int pass_stride,
                                                        int frame_offset,
                                                        bool use_time,
                                                        const ccl_global float *ccl_restrict buffer,
-                                                       const ccl_global float *ccl_restrict transform,
+                                                       const ccl_global float *ccl_restrict
+                                                           transform,
                                                        ccl_global int *rank,
                                                        float weight,
                                                        ccl_global float *XtWX,
                                                        ccl_global float3 *XtWY,
                                                        int localIdx)
 {
-	if(weight < 1e-3f) {
-		return;
-	}
+  if (weight < 1e-3f) {
+    return;
+  }
 
-	int p_offset =  y     * buffer_stride +  x;
-	int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
+  int p_offset = y * buffer_stride + x;
+  int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
 
 #ifdef __KERNEL_GPU__
-	const int stride = storage_stride;
+  const int stride = storage_stride;
 #else
-	const int stride = 1;
-	(void) storage_stride;
+  const int stride = 1;
+  (void)storage_stride;
 #endif
 
 #ifdef __KERNEL_CUDA__
-	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
-	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+  ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
+  ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
 #else
-	float design_row[DENOISE_FEATURES+1];
+  float design_row[DENOISE_FEATURES + 1];
 #endif
 
-	float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+  float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
 
-	/* If the pixel was flagged as an outlier during prefiltering, skip it. */
-	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
-		return;
-	}
+  /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+  if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+    return;
+  }
 
-	filter_get_design_row_transform(make_int3(x, y, t),       buffer + p_offset,
-	                                make_int3(x+dx, y+dy, t), buffer + q_offset,
-	                                pass_stride, *rank, design_row, transform, stride, use_time);
+  filter_get_design_row_transform(make_int3(x, y, t),
+                                  buffer + p_offset,
+                                  make_int3(x + dx, y + dy, t),
+                                  buffer + q_offset,
+                                  pass_stride,
+                                  *rank,
+                                  design_row,
+                                  transform,
+                                  stride,
+                                  use_time);
 
 #ifdef __KERNEL_GPU__
-	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
-	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+  math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
+  math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
 #else
-	math_trimatrix_add_gramian(XtWX, (*rank)+1, design_row, weight);
-	math_vec3_add(XtWY, (*rank)+1, design_row, weight * q_color);
+  math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
+  math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
 #endif
 }
 
-ccl_device_inline void kernel_filter_finalize(int x, int y,
+ccl_device_inline void kernel_filter_finalize(int x,
+                                              int y,
                                               ccl_global float *buffer,
                                               ccl_global int *rank,
                                               int storage_stride,
@@ -82,47 +94,47 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
                                               int sample)
 {
 #ifdef __KERNEL_GPU__
-	const int stride = storage_stride;
+  const int stride = storage_stride;
 #else
-	const int stride = 1;
-	(void) storage_stride;
+  const int stride = 1;
+  (void)storage_stride;
 #endif
 
-	if(XtWX[0] < 1e-3f) {
-		/* There is not enough information to determine a denoised result.
-		 * As a fallback, keep the original value of the pixel. */
-		 return;
-	}
-
-	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
-	 * In case the solution of the linear model fails due to numerical issues or
-	 * returns non-sensical negative values, fall back to this value. */
-	float3 mean_color = XtWY[0]/XtWX[0];
-
-	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
-
-	float3 final_color = XtWY[0];
-	if(!isfinite3_safe(final_color) ||
-	   (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f))
-	{
-		final_color = mean_color;
-	}
-
-	/* Clamp pixel value to positive values. */
-	final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
-
-	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
-	if(buffer_params.w >= 0) {
-		final_color *= sample;
-		if(buffer_params.w > 0) {
-			final_color.x += combined_buffer[buffer_params.w+0];
-			final_color.y += combined_buffer[buffer_params.w+1];
-			final_color.z += combined_buffer[buffer_params.w+2];
-		}
-	}
-	combined_buffer[0] = final_color.x;
-	combined_buffer[1] = final_color.y;
-	combined_buffer[2] = final_color.z;
+  if (XtWX[0] < 1e-3f) {
+    /* There is not enough information to determine a denoised result.
+     * As a fallback, keep the original value of the pixel. */
+    return;
+  }
+
+  /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+   * In case the solution of the linear model fails due to numerical issues or
+   * returns non-sensical negative values, fall back to this value. */
+  float3 mean_color = XtWY[0] / XtWX[0];
+
+  math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
+
+  float3 final_color = XtWY[0];
+  if (!isfinite3_safe(final_color) ||
+      (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
+    final_color = mean_color;
+  }
+
+  /* Clamp pixel value to positive values. */
+  final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
+  ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
+                                                   buffer_params.z;
+  if (buffer_params.w >= 0) {
+    final_color *= sample;
+    if (buffer_params.w > 0) {
+      final_color.x += combined_buffer[buffer_params.w + 0];
+      final_color.y += combined_buffer[buffer_params.w + 1];
+      final_color.z += combined_buffer[buffer_params.w + 2];
+    }
+  }
+  combined_buffer[0] = final_color.x;
+  combined_buffer[1] = final_color.y;
+  combined_buffer[2] = final_color.z;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 94e27bb02fd..69e3c7c458d 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -18,92 +18,101 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
                                                   CCL_FILTER_TILE_INFO,
-                                                  int x, int y, int4 rect,
-                                                  int pass_stride, int frame_stride,
+                                                  int x,
+                                                  int y,
+                                                  int4 rect,
+                                                  int pass_stride,
+                                                  int frame_stride,
                                                   bool use_time,
-                                                  float *transform, int *rank,
-                                                  int radius, float pca_threshold)
+                                                  float *transform,
+                                                  int *rank,
+                                                  int radius,
+                                                  float pca_threshold)
 {
-	int buffer_w = align_up(rect.z - rect.x, 4);
-
-	float features[DENOISE_FEATURES];
-
-	const float *ccl_restrict pixel_buffer;
-	int3 pixel;
-
-	int num_features = use_time? 11 : 10;
-
-	/* === Calculate denoising window. === */
-	int2 low  = make_int2(max(rect.x, x - radius),
-	                      max(rect.y, y - radius));
-	int2 high = make_int2(min(rect.z, x + radius + 1),
-	                      min(rect.w, y + radius + 1));
-	int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
-	/* === Shift feature passes to have mean 0. === */
-	float feature_means[DENOISE_FEATURES];
-	math_vector_zero(feature_means, num_features);
-	FOR_PIXEL_WINDOW {
-		filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
-		math_vector_add(feature_means, features, num_features);
-	} END_FOR_PIXEL_WINDOW
-
-	math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
-	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
-	float feature_scale[DENOISE_FEATURES];
-	math_vector_zero(feature_scale, num_features);
-
-	FOR_PIXEL_WINDOW {
-		filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_max(feature_scale, features, num_features);
-	} END_FOR_PIXEL_WINDOW
-
-	filter_calculate_scale(feature_scale, use_time);
-
-	/* === Generate the feature transformation. ===
-	 * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
-	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_matrix_zero(feature_matrix, num_features);
-	FOR_PIXEL_WINDOW {
-		filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_mul(features, feature_scale, num_features);
-		math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
-	} END_FOR_PIXEL_WINDOW
-
-	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-	*rank = 0;
-	/* Prevent overfitting when a small window is used. */
-	int max_rank = min(num_features, num_pixels/3);
-	if(pca_threshold < 0.0f) {
-		float threshold_energy = 0.0f;
-		for(int i = 0; i < num_features; i++) {
-			threshold_energy += feature_matrix[i*num_features+i];
-		}
-		threshold_energy *= 1.0f - (-pca_threshold);
-
-		float reduced_energy = 0.0f;
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			if(i >= 2 && reduced_energy >= threshold_energy)
-				break;
-			float s = feature_matrix[i*num_features+i];
-			reduced_energy += s;
-		}
-	}
-	else {
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			float s = feature_matrix[i*num_features+i];
-			if(i >= 2 && sqrtf(s) < pca_threshold)
-				break;
-		}
-	}
-
-	/* Bake the feature scaling into the transformation matrix. */
-	for(int i = 0; i < (*rank); i++) {
-		math_vector_mul(transform + i*num_features, feature_scale, num_features);
-	}
-	math_matrix_transpose(transform, num_features, 1);
+  int buffer_w = align_up(rect.z - rect.x, 4);
+
+  float features[DENOISE_FEATURES];
+
+  const float *ccl_restrict pixel_buffer;
+  int3 pixel;
+
+  int num_features = use_time ? 11 : 10;
+
+  /* === Calculate denoising window. === */
+  int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+  int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+  int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+
+  /* === Shift feature passes to have mean 0. === */
+  float feature_means[DENOISE_FEATURES];
+  math_vector_zero(feature_means, num_features);
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+    math_vector_add(feature_means, features, num_features);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
+
+  /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+  float feature_scale[DENOISE_FEATURES];
+  math_vector_zero(feature_scale, num_features);
+
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_max(feature_scale, features, num_features);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  filter_calculate_scale(feature_scale, use_time);
+
+  /* === Generate the feature transformation. ===
+   * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+   * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+  float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+  math_matrix_zero(feature_matrix, num_features);
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_mul(features, feature_scale, num_features);
+    math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
+  *rank = 0;
+  /* Prevent overfitting when a small window is used. */
+  int max_rank = min(num_features, num_pixels / 3);
+  if (pca_threshold < 0.0f) {
+    float threshold_energy = 0.0f;
+    for (int i = 0; i < num_features; i++) {
+      threshold_energy += feature_matrix[i * num_features + i];
+    }
+    threshold_energy *= 1.0f - (-pca_threshold);
+
+    float reduced_energy = 0.0f;
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      if (i >= 2 && reduced_energy >= threshold_energy)
+        break;
+      float s = feature_matrix[i * num_features + i];
+      reduced_energy += s;
+    }
+  }
+  else {
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      float s = feature_matrix[i * num_features + i];
+      if (i >= 2 && sqrtf(s) < pca_threshold)
+        break;
+    }
+  }
+
+  /* Bake the feature scaling into the transformation matrix. */
+  for (int i = 0; i < (*rank); i++) {
+    math_vector_mul(transform + i * num_features, feature_scale, num_features);
+  }
+  math_matrix_transpose(transform, num_features, 1);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index ed8ddcb49b1..89cddfd927f 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -18,106 +18,110 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
                                                   CCL_FILTER_TILE_INFO,
-                                                  int x, int y, int4 rect,
-                                                  int pass_stride, int frame_stride,
+                                                  int x,
+                                                  int y,
+                                                  int4 rect,
+                                                  int pass_stride,
+                                                  int frame_stride,
                                                   bool use_time,
                                                   ccl_global float *transform,
                                                   ccl_global int *rank,
-                                                  int radius, float pca_threshold,
-                                                  int transform_stride, int localIdx)
+                                                  int radius,
+                                                  float pca_threshold,
+                                                  int transform_stride,
+                                                  int localIdx)
 {
-	int buffer_w = align_up(rect.z - rect.x, 4);
+  int buffer_w = align_up(rect.z - rect.x, 4);
 
 #ifdef __KERNEL_CUDA__
-	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
-	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+  ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
+  ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
 #else
-	float features[DENOISE_FEATURES];
+  float features[DENOISE_FEATURES];
 #endif
 
-	int num_features = use_time? 11 : 10;
-
-	/* === Calculate denoising window. === */
-	int2 low  = make_int2(max(rect.x, x - radius),
-	                      max(rect.y, y - radius));
-	int2 high = make_int2(min(rect.z, x + radius + 1),
-	                      min(rect.w, y + radius + 1));
-	int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-	const ccl_global float *ccl_restrict pixel_buffer;
-	int3 pixel;
-
-
-
-
-	/* === Shift feature passes to have mean 0. === */
-	float feature_means[DENOISE_FEATURES];
-	math_vector_zero(feature_means, num_features);
-	FOR_PIXEL_WINDOW {
-		filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
-		math_vector_add(feature_means, features, num_features);
-	} END_FOR_PIXEL_WINDOW
-
-	math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
-	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
-	float feature_scale[DENOISE_FEATURES];
-	math_vector_zero(feature_scale, num_features);
-
-	FOR_PIXEL_WINDOW {
-		filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_max(feature_scale, features, num_features);
-	} END_FOR_PIXEL_WINDOW
-
-	filter_calculate_scale(feature_scale, use_time);
-
-
-
-	/* === Generate the feature transformation. ===
-	 * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
-	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_matrix_zero(feature_matrix, num_features);
-	FOR_PIXEL_WINDOW {
-		filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_mul(features, feature_scale, num_features);
-		math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
-	} END_FOR_PIXEL_WINDOW
-
-	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
-	*rank = 0;
-	/* Prevent overfitting when a small window is used. */
-	int max_rank = min(num_features, num_pixels/3);
-	if(pca_threshold < 0.0f) {
-		float threshold_energy = 0.0f;
-		for(int i = 0; i < num_features; i++) {
-			threshold_energy += feature_matrix[i*num_features+i];
-		}
-		threshold_energy *= 1.0f - (-pca_threshold);
-
-		float reduced_energy = 0.0f;
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			if(i >= 2 && reduced_energy >= threshold_energy)
-				break;
-			float s = feature_matrix[i*num_features+i];
-			reduced_energy += s;
-		}
-	}
-	else {
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			float s = feature_matrix[i*num_features+i];
-			if(i >= 2 && sqrtf(s) < pca_threshold)
-				break;
-		}
-	}
-
-	math_matrix_transpose(transform, num_features, transform_stride);
-
-	/* Bake the feature scaling into the transformation matrix. */
-	for(int i = 0; i < num_features; i++) {
-		for(int j = 0; j < (*rank); j++) {
-			transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
-		}
-	}
+  int num_features = use_time ? 11 : 10;
+
+  /* === Calculate denoising window. === */
+  int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+  int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+  int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+  const ccl_global float *ccl_restrict pixel_buffer;
+  int3 pixel;
+
+  /* === Shift feature passes to have mean 0. === */
+  float feature_means[DENOISE_FEATURES];
+  math_vector_zero(feature_means, num_features);
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+    math_vector_add(feature_means, features, num_features);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
+
+  /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+  float feature_scale[DENOISE_FEATURES];
+  math_vector_zero(feature_scale, num_features);
+
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_max(feature_scale, features, num_features);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  filter_calculate_scale(feature_scale, use_time);
+
+  /* === Generate the feature transformation. ===
+   * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+   * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+  float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+  math_matrix_zero(feature_matrix, num_features);
+  FOR_PIXEL_WINDOW
+  {
+    filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_mul(features, feature_scale, num_features);
+    math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
+  }
+  END_FOR_PIXEL_WINDOW
+
+  math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
+  *rank = 0;
+  /* Prevent overfitting when a small window is used. */
+  int max_rank = min(num_features, num_pixels / 3);
+  if (pca_threshold < 0.0f) {
+    float threshold_energy = 0.0f;
+    for (int i = 0; i < num_features; i++) {
+      threshold_energy += feature_matrix[i * num_features + i];
+    }
+    threshold_energy *= 1.0f - (-pca_threshold);
+
+    float reduced_energy = 0.0f;
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      if (i >= 2 && reduced_energy >= threshold_energy)
+        break;
+      float s = feature_matrix[i * num_features + i];
+      reduced_energy += s;
+    }
+  }
+  else {
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      float s = feature_matrix[i * num_features + i];
+      if (i >= 2 && sqrtf(s) < pca_threshold)
+        break;
+    }
+  }
+
+  math_matrix_transpose(transform, num_features, transform_stride);
+
+  /* Bake the feature scaling into the transformation matrix. */
+  for (int i = 0; i < num_features; i++) {
+    for (int j = 0; j < (*rank); j++) {
+      transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
+    }
+  }
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 10bd3e477e9..22397b292db 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -18,98 +18,110 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
                                                   CCL_FILTER_TILE_INFO,
-                                                  int x, int y, int4 rect,
-                                                  int pass_stride, int frame_stride,
+                                                  int x,
+                                                  int y,
+                                                  int4 rect,
+                                                  int pass_stride,
+                                                  int frame_stride,
                                                   bool use_time,
-                                                  float *transform, int *rank,
-                                                  int radius, float pca_threshold)
+                                                  float *transform,
+                                                  int *rank,
+                                                  int radius,
+                                                  float pca_threshold)
 {
-	int buffer_w = align_up(rect.z - rect.x, 4);
-
-	float4 features[DENOISE_FEATURES];
-	const float *ccl_restrict pixel_buffer;
-	int3 pixel;
-
-	int num_features = use_time? 11 : 10;
-
-	/* === Calculate denoising window. === */
-	int2 low  = make_int2(max(rect.x, x - radius),
-	                      max(rect.y, y - radius));
-	int2 high = make_int2(min(rect.z, x + radius + 1),
-	                      min(rect.w, y + radius + 1));
-	int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
-	/* === Shift feature passes to have mean 0. === */
-	float4 feature_means[DENOISE_FEATURES];
-	math_vector_zero_sse(feature_means, num_features);
-	FOR_PIXEL_WINDOW_SSE {
-		filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
-		math_vector_add_sse(feature_means, num_features, features);
-	} END_FOR_PIXEL_WINDOW_SSE
-
-	float4 pixel_scale = make_float4(1.0f / num_pixels);
-	for(int i = 0; i < num_features; i++) {
-		feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
-	}
-
-	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
-	float4 feature_scale[DENOISE_FEATURES];
-	math_vector_zero_sse(feature_scale, num_features);
-	FOR_PIXEL_WINDOW_SSE {
-		filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_max_sse(feature_scale, features, num_features);
-	} END_FOR_PIXEL_WINDOW_SSE
-
-	filter_calculate_scale_sse(feature_scale, use_time);
-
-	/* === Generate the feature transformation. ===
-	 * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
-	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-	float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_matrix_zero_sse(feature_matrix_sse, num_features);
-	FOR_PIXEL_WINDOW_SSE {
-		filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
-		math_vector_mul_sse(features, num_features, feature_scale);
-		math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
-	} END_FOR_PIXEL_WINDOW_SSE
-
-	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-	math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
-	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
-	*rank = 0;
-	/* Prevent overfitting when a small window is used. */
-	int max_rank = min(num_features, num_pixels/3);
-	if(pca_threshold < 0.0f) {
-		float threshold_energy = 0.0f;
-		for(int i = 0; i < num_features; i++) {
-			threshold_energy += feature_matrix[i*num_features+i];
-		}
-		threshold_energy *= 1.0f - (-pca_threshold);
-
-		float reduced_energy = 0.0f;
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			if(i >= 2 && reduced_energy >= threshold_energy)
-				break;
-			float s = feature_matrix[i*num_features+i];
-			reduced_energy += s;
-		}
-	}
-	else {
-		for(int i = 0; i < max_rank; i++, (*rank)++) {
-			float s = feature_matrix[i*num_features+i];
-			if(i >= 2 && sqrtf(s) < pca_threshold)
-				break;
-		}
-	}
-
-	math_matrix_transpose(transform, num_features, 1);
-
-	/* Bake the feature scaling into the transformation matrix. */
-	for(int i = 0; i < num_features; i++) {
-		math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
-	}
+  int buffer_w = align_up(rect.z - rect.x, 4);
+
+  float4 features[DENOISE_FEATURES];
+  const float *ccl_restrict pixel_buffer;
+  int3 pixel;
+
+  int num_features = use_time ? 11 : 10;
+
+  /* === Calculate denoising window. === */
+  int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+  int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+  int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+
+  /* === Shift feature passes to have mean 0. === */
+  float4 feature_means[DENOISE_FEATURES];
+  math_vector_zero_sse(feature_means, num_features);
+  FOR_PIXEL_WINDOW_SSE
+  {
+    filter_get_features_sse(
+        x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+    math_vector_add_sse(feature_means, num_features, features);
+  }
+  END_FOR_PIXEL_WINDOW_SSE
+
+  float4 pixel_scale = make_float4(1.0f / num_pixels);
+  for (int i = 0; i < num_features; i++) {
+    feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
+  }
+
+  /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+  float4 feature_scale[DENOISE_FEATURES];
+  math_vector_zero_sse(feature_scale, num_features);
+  FOR_PIXEL_WINDOW_SSE
+  {
+    filter_get_feature_scales_sse(
+        x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_max_sse(feature_scale, features, num_features);
+  }
+  END_FOR_PIXEL_WINDOW_SSE
+
+  filter_calculate_scale_sse(feature_scale, use_time);
+
+  /* === Generate the feature transformation. ===
+   * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+   * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+  float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
+  math_matrix_zero_sse(feature_matrix_sse, num_features);
+  FOR_PIXEL_WINDOW_SSE
+  {
+    filter_get_features_sse(
+        x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+    math_vector_mul_sse(features, num_features, feature_scale);
+    math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
+  }
+  END_FOR_PIXEL_WINDOW_SSE
+
+  float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+  math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
+
+  math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
+
+  *rank = 0;
+  /* Prevent overfitting when a small window is used. */
+  int max_rank = min(num_features, num_pixels / 3);
+  if (pca_threshold < 0.0f) {
+    float threshold_energy = 0.0f;
+    for (int i = 0; i < num_features; i++) {
+      threshold_energy += feature_matrix[i * num_features + i];
+    }
+    threshold_energy *= 1.0f - (-pca_threshold);
+
+    float reduced_energy = 0.0f;
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      if (i >= 2 && reduced_energy >= threshold_energy)
+        break;
+      float s = feature_matrix[i * num_features + i];
+      reduced_energy += s;
+    }
+  }
+  else {
+    for (int i = 0; i < max_rank; i++, (*rank)++) {
+      float s = feature_matrix[i * num_features + i];
+      if (i >= 2 && sqrtf(s) < pca_threshold)
+        break;
+    }
+  }
+
+  math_matrix_transpose(transform, num_features, 1);
+
+  /* Bake the feature scaling into the transformation matrix. */
+  for (int i = 0; i < num_features; i++) {
+    math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
+  }
 }
 
 CCL_NAMESPACE_END
author	Campbell Barton <ideasman42@gmail.com>	2019-04-17 07:17:24 +0300
committer	Campbell Barton <ideasman42@gmail.com>	2019-04-17 07:21:24 +0300
commit	e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch)
tree	8cf3453d12edb177a218ef8009357518ec6cab6a /intern/cycles/kernel/filter
parent	b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff)