Cycles: Improve denoising speed on GPUs with small tile sizes

Previously, the NLM kernels would be launched once per offset with one thread per pixel. However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs, which resulted in a significant slowdown. Therefore, the kernels are now launched in a single call that handles all offsets at once. This has two downsides: memory accesses to accumulating buffers are now atomic, and, more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory. On the other hand, of course, the smaller tiles significantly reduce the amount of memory required. The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there; one thread per pixel is the maximum. I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere. To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented; it should be easier to understand what's going on now. Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
author: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-10 06:34:14 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-30 09:37:08 +0300
commit: fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree: 516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/kernel/kernels
parent: df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)
4 files changed, 195 insertions, 125 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index bf13ba62806..4231aba88d7 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -74,7 +74,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            float *variance,
                                                            float *difference_image,
                                                            int* rect,
-                                                           int w,
+                                                           int stride,
                                                            int channel_offset,
                                                            float a,
                                                            float k_2);
@@ -82,13 +82,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
                                                 float *out_image,
                                                 int* rect,
-                                                int w,
+                                                int stride,
                                                 int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
                                                        float *out_image,
                                                        int* rect,
-                                                       int w,
+                                                       int stride,
                                                        int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
@@ -98,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
-                                                         int w,
+                                                         int stride,
                                                          int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
@@ -110,22 +110,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              float *XtWX,
                                                              float3 *XtWY,
                                                              int *rect,
-                                                             int *filter_rect,
-                                                             int w,
-                                                             int h,
+                                                             int *filter_window,
+                                                             int stride,
                                                              int f,
                                                              int pass_stride);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
                                                      int* rect,
-                                                     int w);
+                                                     int stride);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
                                                 int y,
                                                 int storage_ofs,
-                                                int w,
-                                                int h,
                                                 float *buffer,
                                                 int *rank,
                                                 float *XtWX,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 2fbb0ea2bdb..ab39260784b 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -150,7 +150,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            float *variance,
                                                            float *difference_image,
                                                            int *rect,
-                                                           int w,
+                                                           int stride,
                                                            int channel_offset,
                                                            float a,
                                                            float k_2)
@@ -158,33 +158,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
 #else
-	kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
+	kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
                                                 float *out_image,
                                                 int *rect,
-                                                int w,
+                                                int stride,
                                                 int f)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
 #else
-	kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
+	kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
                                                        float *out_image,
                                                        int *rect,
-                                                       int w,
+                                                       int stride,
                                                        int f)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
 #else
-	kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
+	kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
 #endif
 }
 
@@ -195,13 +195,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
-                                                         int w,
+                                                         int stride,
                                                          int f)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
@@ -214,36 +214,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              float *XtWX,
                                                              float3 *XtWY,
                                                              int *rect,
-                                                             int *filter_rect,
-                                                             int w,
-                                                             int h,
+                                                             int *filter_window,
+                                                             int stride,
                                                              int f,
                                                              int pass_stride)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-    kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+	kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
                                                      int *rect,
-                                                     int w)
+                                                     int stride)
 {
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
 #else
-	kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
+	kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
                                                 int y,
                                                 int storage_ofs,
-                                                int w,
-                                                int h,
                                                 float *buffer,
                                                 int *rank,
                                                 float *XtWX,
@@ -257,7 +254,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
 	XtWX += storage_ofs*XTWX_SIZE;
 	XtWY += storage_ofs*XTWY_SIZE;
 	rank += storage_ofs;
-	kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+	kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index c8172355a7f..035f0484488 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
-                                       const float *ccl_restrict weight_image,
+kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        const float *ccl_restrict variance_image,
                                        float *difference_image,
-                                       int4 rect, int w,
+                                       int w,
+                                       int h,
+                                       int stride,
+                                       int shift_stride,
+                                       int r,
                                        int channel_offset,
-                                       float a, float k_2)
+                                       float a,
+                                       float k_2)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+		                                  weight_image,
+		                                  variance_image,
+		                                  difference_image + ofs,
+		                                  rect, stride,
+		                                  channel_offset, a, k_2);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
+                            float *out_image,
+                            int w,
+                            int h,
+                            int stride,
+                            int shift_stride,
+                            int r,
+                            int f)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_blur(co.x, co.y,
+		                       difference_image + ofs,
+		                       out_image + ofs,
+		                       rect, stride, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                   float *out_image,
+                                   int w,
+                                   int h,
+                                   int stride,
+                                   int shift_stride,
+                                   int r,
+                                   int f)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_calc_weight(co.x, co.y,
+		                              difference_image + ofs,
+		                              out_image + ofs,
+		                              rect, stride, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(int dx, int dy,
-                                     const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      const float *ccl_restrict image,
-                                     float *out_image, float *accum_image,
-                                     int4 rect, int w,
+                                     float *out_image,
+                                     float *accum_image,
+                                     int w,
+                                     int h,
+                                     int stride,
+                                     int shift_stride,
+                                     int r,
                                      int f)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+		                                difference_image + ofs,
+		                                image,
+		                                out_image,
+		                                accum_image,
+		                                rect, stride, f);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+kernel_cuda_filter_nlm_normalize(float *out_image,
+                                 const float *ccl_restrict accum_image,
+                                 int w,
+                                 int h,
+                                 int stride)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+	int x = blockDim.x*blockIdx.x + threadIdx.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < w && y < h) {
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
 	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
-                                         const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
                                          float const* __restrict__ transform,
                                          int *rank,
                                          float *XtWX,
                                          float3 *XtWY,
-                                         int4 rect,
-                                         int4 filter_rect,
-                                         int w, int h, int f,
+                                         int4 filter_window,
+                                         int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
+                                         int f,
                                          int pass_stride)
 {
-	int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
-	int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
-	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
-		kernel_filter_nlm_construct_gramian(x, y,
-		                                    dx, dy,
-		                                    difference_image,
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+		kernel_filter_nlm_construct_gramian(co.x, co.y,
+		                                    co.z, co.w,
+		                                    difference_image + ofs,
 		                                    buffer,
 		                                    transform, rank,
 		                                    XtWX, XtWY,
-		                                    rect, filter_rect,
-		                                    w, h, f,
+		                                    rect, filter_window,
+		                                    stride, f,
 		                                    pass_stride,
 		                                    threadIdx.y*blockDim.x + threadIdx.x);
 	}
@@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(int w, int h,
-                            float *buffer, int *rank,
-                            float *XtWX, float3 *XtWY,
-                            int4 filter_area, int4 buffer_params,
+kernel_cuda_filter_finalize(float *buffer,
+                            int *rank,
+                            float *XtWX,
+                            float3 *XtWY,
+                            int4 filter_area,
+                            int4 buffer_params,
                             int sample)
 {
 	int x = blockDim.x*blockIdx.x + threadIdx.x;
@@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h,
 		rank += storage_ofs;
 		XtWX += storage_ofs;
 		XtWY += storage_ofs;
-		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+		kernel_filter_finalize(x, y, buffer, rank,
+		                       filter_area.z*filter_area.w,
+		                       XtWX, XtWY,
+		                       buffer_params, sample);
 	}
 }
 
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 7a7b596a350..2b77807c38b 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -126,113 +126,136 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
-                                                    int dy,
-                                                    const ccl_global float *ccl_restrict weight_image,
+__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
                                                     const ccl_global float *ccl_restrict variance_image,
                                                     ccl_global float *difference_image,
-                                                    int4 rect,
                                                     int w,
+                                                    int h,
+                                                    int stride,
+                                                    int shift_stride,
+                                                    int r,
                                                     int channel_offset,
                                                     float a,
                                                     float k_2)
 {
-	int x = get_global_id(0) + rect.x;
-	int y = get_global_id(1) + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+		                                  weight_image,
+		                                  variance_image,
+		                                  difference_image + ofs,
+		                                  rect, stride,
+		                                  channel_offset, a, k_2);
 	}
 }
 
 __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
                                          ccl_global float *out_image,
-                                         int4 rect,
                                          int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
                                          int f)
 {
-	int x = get_global_id(0) + rect.x;
-	int y = get_global_id(1) + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_blur(co.x, co.y,
+		                       difference_image + ofs,
+		                       out_image + ofs,
+		                       rect, stride, f);
 	}
 }
 
 __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
                                                 ccl_global float *out_image,
-                                                int4 rect,
                                                 int w,
+                                                int h,
+                                                int stride,
+                                                int shift_stride,
+                                                int r,
                                                 int f)
 {
-	int x = get_global_id(0) + rect.x;
-	int y = get_global_id(1) + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_calc_weight(co.x, co.y,
+		                              difference_image + ofs,
+		                              out_image + ofs,
+		                              rect, stride, f);
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_update_output(int dx,
-                                                  int dy,
-                                                  const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
                                                   const ccl_global float *ccl_restrict image,
                                                   ccl_global float *out_image,
                                                   ccl_global float *accum_image,
-                                                  int4 rect,
                                                   int w,
+                                                  int h,
+                                                  int stride,
+                                                  int shift_stride,
+                                                  int r,
                                                   int f)
 {
-	int x = get_global_id(0) + rect.x;
-	int y = get_global_id(1) + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+		                                difference_image + ofs,
+		                                image,
+		                                out_image,
+		                                accum_image,
+		                                rect, stride, f);
 	}
 }
 
 __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
                                               const ccl_global float *ccl_restrict accum_image,
-                                              int4 rect,
-                                              int w)
+                                              int w,
+                                              int h,
+                                              int stride)
 {
-	int x = get_global_id(0) + rect.x;
-	int y = get_global_id(1) + rect.y;
-	if(x < rect.z && y < rect.w) {
-		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+	if(x < w && y < h) {
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
 	}
 }
 
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
-                                                      int dy,
-                                                      const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
                                                       const ccl_global float *ccl_restrict buffer,
                                                       const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
                                                       ccl_global float *XtWX,
                                                       ccl_global float3 *XtWY,
-                                                      int4 rect,
-                                                      int4 filter_rect,
+                                                      int4 filter_window,
                                                       int w,
                                                       int h,
+                                                      int stride,
+                                                      int shift_stride,
+                                                      int r,
                                                       int f,
                                                       int pass_stride)
 {
-	int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
-	int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
-	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
-		kernel_filter_nlm_construct_gramian(x, y,
-		                                    dx, dy,
-		                                    difference_image,
+	int4 co, rect;
+	int ofs;
+	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+		kernel_filter_nlm_construct_gramian(co.x, co.y,
+		                                    co.z, co.w,
+		                                    difference_image + ofs,
 		                                    buffer,
 		                                    transform, rank,
 		                                    XtWX, XtWY,
-		                                    rect, filter_rect,
-		                                    w, h, f,
+		                                    rect, filter_window,
+		                                    stride, f,
 		                                    pass_stride,
 		                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
 	}
 }
 
-__kernel void kernel_ocl_filter_finalize(int w,
-                                         int h,
-                                         ccl_global float *buffer,
+__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
                                          ccl_global int *rank,
                                          ccl_global float *XtWX,
                                          ccl_global float3 *XtWY,
@@ -247,7 +270,10 @@ __kernel void kernel_ocl_filter_finalize(int w,
 		rank += storage_ofs;
 		XtWX += storage_ofs;
 		XtWY += storage_ofs;
-		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+		kernel_filter_finalize(x, y, buffer, rank,
+		                       filter_area.z*filter_area.w,
+		                       XtWX, XtWY,
+		                       buffer_params, sample);
 	}
 }
author	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-10 06:34:14 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-30 09:37:08 +0300
commit	fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree	516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/kernel/kernels
parent	df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)