1 files changed, 197 insertions, 168 deletions
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 12636393243..650c743f34f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN
  * Window is the rect that should be processed.
  * co is filled with (x, y, dx, dy).
  */
-ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
-                                             int4 *rect, int4 *co, int *ofs,
-                                             int4 window)
+ccl_device_inline bool get_nlm_coords_window(
+    int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
 {
-	/* Determine the pixel offset that this thread should apply. */
-	int s = 2*r+1;
-	int si = ccl_global_id(1);
-	int sx = si % s;
-	int sy = si / s;
-	if(sy >= s) {
-		return false;
-	}
-
-	/* Pixels still need to lie inside the denoising buffer after applying the offset,
-	 * so determine the area for which this is the case. */
-	int dx = sx - r;
-	int dy = sy - r;
-
-	*rect = make_int4(max(0, -dx),     max(0, -dy),
-	              w - max(0,  dx), h - max(0,  dy));
-
-	/* Find the intersection of the area that we want to process (window) and the area
-	 * that can be processed (rect) to get the final area for this offset. */
-	int4 clip_area = rect_clip(window, *rect);
-
-	/* If the radius is larger than one of the sides of the window,
-	 * there will be shifts for which there is no usable pixel at all. */
-	if(!rect_is_valid(clip_area)) {
-		return false;
-	}
-
-	/* Map the linear thread index to pixels inside the clip area. */
-	int x, y;
-	if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
-		return false;
-	}
-
-	*co = make_int4(x, y, dx, dy);
-
-	*ofs = (sy*s + sx) * stride;
-
-	return true;
+  /* Determine the pixel offset that this thread should apply. */
+  int s = 2 * r + 1;
+  int si = ccl_global_id(1);
+  int sx = si % s;
+  int sy = si / s;
+  if (sy >= s) {
+    return false;
+  }
+
+  /* Pixels still need to lie inside the denoising buffer after applying the offset,
+   * so determine the area for which this is the case. */
+  int dx = sx - r;
+  int dy = sy - r;
+
+  *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
+
+  /* Find the intersection of the area that we want to process (window) and the area
+   * that can be processed (rect) to get the final area for this offset. */
+  int4 clip_area = rect_clip(window, *rect);
+
+  /* If the radius is larger than one of the sides of the window,
+   * there will be shifts for which there is no usable pixel at all. */
+  if (!rect_is_valid(clip_area)) {
+    return false;
+  }
+
+  /* Map the linear thread index to pixels inside the clip area. */
+  int x, y;
+  if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+    return false;
+  }
+
+  *co = make_int4(x, y, dx, dy);
+
+  *ofs = (sy * s + sx) * stride;
+
+  return true;
 }
 
-ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
-                                      int4 *rect, int4 *co, int *ofs)
+ccl_device_inline bool get_nlm_coords(
+    int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
 {
-	return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+  return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
-                                                         int dx, int dy,
-                                                         const ccl_global float *ccl_restrict weight_image,
-                                                         const ccl_global float *ccl_restrict variance_image,
-                                                         const ccl_global float *ccl_restrict scale_image,
-                                                         ccl_global float *difference_image,
-                                                         int4 rect, int stride,
-                                                         int channel_offset,
-                                                         int frame_offset,
-                                                         float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(
+    int x,
+    int y,
+    int dx,
+    int dy,
+    const ccl_global float *ccl_restrict weight_image,
+    const ccl_global float *ccl_restrict variance_image,
+    const ccl_global float *ccl_restrict scale_image,
+    ccl_global float *difference_image,
+    int4 rect,
+    int stride,
+    int channel_offset,
+    int frame_offset,
+    float a,
+    float k_2)
 {
-	int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
-	int numChannels = channel_offset? 3 : 1;
-
-	float diff = 0.0f;
-	float scale_fac = 1.0f;
-	if(scale_image) {
-		scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
-	}
-
-	for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
-		float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
-		float pvar = variance_image[idx_p];
-		float qvar = sqr(scale_fac)*variance_image[idx_q];
-		diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
-	}
-	if(numChannels > 1) {
-		diff *= 1.0f/numChannels;
-	}
-	difference_image[y*stride + x] = diff;
+  int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
+  int numChannels = channel_offset ? 3 : 1;
+
+  float diff = 0.0f;
+  float scale_fac = 1.0f;
+  if (scale_image) {
+    scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+  }
+
+  for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+    float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
+    float pvar = variance_image[idx_p];
+    float qvar = sqr(scale_fac) * variance_image[idx_q];
+    diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
+  }
+  if (numChannels > 1) {
+    diff *= 1.0f / numChannels;
+  }
+  difference_image[y * stride + x] = diff;
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
-                                              const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_blur(int x,
+                                              int y,
+                                              const ccl_global float *ccl_restrict
+                                                  difference_image,
                                               ccl_global float *out_image,
-                                              int4 rect, int stride, int f)
+                                              int4 rect,
+                                              int stride,
+                                              int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.y, y-f);
-	const int high = min(rect.w, y+f+1);
-	for(int y1 = low; y1 < high; y1++) {
-		sum += difference_image[y1*stride + x];
-	}
-	sum *= 1.0f/(high-low);
-	out_image[y*stride + x] = sum;
+  float sum = 0.0f;
+  const int low = max(rect.y, y - f);
+  const int high = min(rect.w, y + f + 1);
+  for (int y1 = low; y1 < high; y1++) {
+    sum += difference_image[y1 * stride + x];
+  }
+  sum *= 1.0f / (high - low);
+  out_image[y * stride + x] = sum;
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
-                                                     const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
+                                                     int y,
+                                                     const ccl_global float *ccl_restrict
+                                                         difference_image,
                                                      ccl_global float *out_image,
-                                                     int4 rect, int stride, int f)
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	sum *= 1.0f/(high-low);
-	out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
+  float sum = 0.0f;
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  sum *= 1.0f / (high - low);
+  out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
 }
 
-ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
-                                                       int dx, int dy,
-                                                       const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_update_output(int x,
+                                                       int y,
+                                                       int dx,
+                                                       int dy,
+                                                       const ccl_global float *ccl_restrict
+                                                           difference_image,
                                                        const ccl_global float *ccl_restrict image,
                                                        ccl_global float *out_image,
                                                        ccl_global float *accum_image,
-                                                       int4 rect, int channel_offset,
-                                                       int stride, int f)
+                                                       int4 rect,
+                                                       int channel_offset,
+                                                       int stride,
+                                                       int f)
 {
-	float sum = 0.0f;
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	sum *= 1.0f/(high-low);
-
-	int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
-	if(out_image) {
-		atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
-		float val = image[idx_q];
-		if(channel_offset) {
-			val += image[idx_q + channel_offset];
-			val += image[idx_q + 2*channel_offset];
-			val *= 1.0f/3.0f;
-		}
-		atomic_add_and_fetch_float(out_image + idx_p, sum*val);
-	}
-	else {
-		accum_image[idx_p] = sum;
-	}
+  float sum = 0.0f;
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  sum *= 1.0f / (high - low);
+
+  int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
+  if (out_image) {
+    atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+    float val = image[idx_q];
+    if (channel_offset) {
+      val += image[idx_q + channel_offset];
+      val += image[idx_q + 2 * channel_offset];
+      val *= 1.0f / 3.0f;
+    }
+    atomic_add_and_fetch_float(out_image + idx_p, sum * val);
+  }
+  else {
+    accum_image[idx_p] = sum;
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
-                                                           int dx, int dy, int t,
-                                                           const ccl_global float *ccl_restrict difference_image,
-                                                           const ccl_global float *ccl_restrict buffer,
-                                                           const ccl_global float *ccl_restrict transform,
-                                                           ccl_global int *rank,
-                                                           ccl_global float *XtWX,
-                                                           ccl_global float3 *XtWY,
-                                                           int4 rect,
-                                                           int4 filter_window,
-                                                           int stride, int f,
-                                                           int pass_stride,
-                                                           int frame_offset,
-                                                           bool use_time,
-                                                           int localIdx)
+ccl_device_inline void kernel_filter_nlm_construct_gramian(
+    int x,
+    int y,
+    int dx,
+    int dy,
+    int t,
+    const ccl_global float *ccl_restrict difference_image,
+    const ccl_global float *ccl_restrict buffer,
+    const ccl_global float *ccl_restrict transform,
+    ccl_global int *rank,
+    ccl_global float *XtWX,
+    ccl_global float3 *XtWY,
+    int4 rect,
+    int4 filter_window,
+    int stride,
+    int f,
+    int pass_stride,
+    int frame_offset,
+    bool use_time,
+    int localIdx)
 {
-	const int low = max(rect.x, x-f);
-	const int high = min(rect.z, x+f+1);
-	float sum = 0.0f;
-	for(int x1 = low; x1 < high; x1++) {
-		sum += difference_image[y*stride + x1];
-	}
-	float weight = sum * (1.0f/(high - low));
-
-	/* Reconstruction data is only stored for pixels inside the filter window,
-	 * so compute the pixels's index in there. */
-	int storage_ofs = coord_to_local_index(filter_window, x, y);
-	transform += storage_ofs;
-	rank += storage_ofs;
-	XtWX += storage_ofs;
-	XtWY += storage_ofs;
-
-	kernel_filter_construct_gramian(x, y,
-	                                rect_size(filter_window),
-	                                dx, dy, t,
-	                                stride,
-	                                pass_stride,
-	                                frame_offset,
-	                                use_time,
-	                                buffer,
-	                                transform, rank,
-	                                weight, XtWX, XtWY,
-	                                localIdx);
+  const int low = max(rect.x, x - f);
+  const int high = min(rect.z, x + f + 1);
+  float sum = 0.0f;
+  for (int x1 = low; x1 < high; x1++) {
+    sum += difference_image[y * stride + x1];
+  }
+  float weight = sum * (1.0f / (high - low));
+
+  /* Reconstruction data is only stored for pixels inside the filter window,
+   * so compute the pixel's index in there. */
+  int storage_ofs = coord_to_local_index(filter_window, x, y);
+  transform += storage_ofs;
+  rank += storage_ofs;
+  XtWX += storage_ofs;
+  XtWY += storage_ofs;
+
+  kernel_filter_construct_gramian(x,
+                                  y,
+                                  rect_size(filter_window),
+                                  dx,
+                                  dy,
+                                  t,
+                                  stride,
+                                  pass_stride,
+                                  frame_offset,
+                                  use_time,
+                                  buffer,
+                                  transform,
+                                  rank,
+                                  weight,
+                                  XtWX,
+                                  XtWY,
+                                  localIdx);
 }
 
-ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ccl_device_inline void kernel_filter_nlm_normalize(int x,
+                                                   int y,
                                                    ccl_global float *out_image,
-                                                   const ccl_global float *ccl_restrict accum_image,
+                                                   const ccl_global float *ccl_restrict
+                                                       accum_image,
                                                    int stride)
 {
-	out_image[y*stride + x] /= accum_image[y*stride + x];
+  out_image[y * stride + x] /= accum_image[y * stride + x];
 }
 
 CCL_NAMESPACE_END