Cycles: Improve denoising speed on GPUs with small tile sizes

Previously, the NLM kernels would be launched once per offset with one thread per pixel. However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown. Therefore, the kernels are now launched in a single call that handles all offsets at once. This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory. On the other hand, of course, the smaller tiles significantly reduce the size of the memory. The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum. I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere. To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now. Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
author: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-10 06:34:14 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-30 09:37:08 +0300
commit: fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree: 516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/device
parent: df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)
6 files changed, 271 insertions, 252 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 999b9230d29..2d28ccd2b49 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -190,9 +190,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
+	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
 
 	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
 	                       int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -565,13 +565,13 @@ public:
 			                                    (float*) color_variance_ptr,
 			                                    difference,
 			                                    local_rect,
-			                                    task->buffer.w,
+			                                    task->buffer.stride,
 			                                    task->buffer.pass_stride,
 			                                    1.0f,
 			                                    task->nlm_k_2);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
-			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
-			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
 			filter_nlm_construct_gramian_kernel()(dx, dy,
 			                                      blurDifference,
 			                                      (float*)  task->buffer.mem.device_pointer,
@@ -580,9 +580,8 @@ public:
 			                                      (float*)  task->storage.XtWX.device_pointer,
 			                                      (float3*) task->storage.XtWY.device_pointer,
 			                                      local_rect,
-			                                      &task->reconstruction_state.filter_rect.x,
-			                                      task->buffer.w,
-			                                      task->buffer.h,
+			                                      &task->reconstruction_state.filter_window.x,
+			                                      task->buffer.stride,
 			                                      4,
 			                                      task->buffer.pass_stride);
 		}
@@ -591,8 +590,6 @@ public:
 				filter_finalize_kernel()(x,
 				                         y,
 				                         y*task->filter_area.z + x,
-				                         task->buffer.w,
-				                         task->buffer.h,
 				                         (float*)  output_ptr,
 				                         (int*)    task->storage.rank.device_pointer,
 				                         (float*)  task->storage.XtWX.device_pointer,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d8d787ba706..a663da748df 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1087,6 +1087,19 @@ public:
 			                           threads, threads, 1, \
 			                           0, 0, args, 0));
 
+/* Similar as above, but for 1-dimensional blocks. */
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h)                                                                       \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int xblocks = ((w) + threads_per_block - 1)/threads_per_block;                                      \
+			int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args)                       \
+			cuda_assert(cuLaunchKernel(func,                    \
+			                           xblocks, yblocks, 1,     \
+			                           threads_per_block, 1, 1, \
+			                           0, 0, args, 0));
+
 	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
 	                               DenoisingTask *task)
 	{
@@ -1095,60 +1108,65 @@ public:
 
 		CUDAContextScope scope(this);
 
-		int4 rect = task->rect;
-		int w = align_up(rect.z-rect.x, 4);
-		int h = rect.w-rect.y;
+		int stride = task->buffer.stride;
+		int w = task->buffer.width;
+		int h = task->buffer.h;
 		int r = task->nlm_state.r;
 		int f = task->nlm_state.f;
 		float a = task->nlm_state.a;
 		float k_2 = task->nlm_state.k_2;
 
-		CUdeviceptr difference     = task->nlm_state.temporary_1_ptr;
-		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
-		CUdeviceptr weightAccum    = task->nlm_state.temporary_3_ptr;
+		int shift_stride = stride*h;
+		int num_shifts = (2*r+1)*(2*r+1);
+		int mem_size = sizeof(float)*shift_stride*2*num_shifts;
+		int channel_offset = 0;
 
-		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
-		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+		CUdeviceptr temporary_mem;
+		cuda_assert(cuMemAlloc(&temporary_mem, mem_size));
+		CUdeviceptr difference     = temporary_mem;
+		CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts;
 
-		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-		cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-		cuda_assert(cuModuleGetFunction(&cuNLMNormalize,      cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+		CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride));
 
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,      CU_FUNC_CACHE_PREFER_L1));
+		{
+			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+			cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+			cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
 
-		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
 
-		int dx, dy;
-		int4 local_rect;
-		int channel_offset = 0;
-		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
-		void *blur_args[]            = {&difference, &blurDifference, &local_rect, &w, &f};
-		void *calc_weight_args[]     = {&blurDifference, &difference, &local_rect, &w, &f};
-		void *update_output_args[]   = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
-
-		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-			dy = i / (2*r+1) - r;
-			dx = i % (2*r+1) - r;
-			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-
-			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
-		}
-
-		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
-		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
-		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-		cuda_assert(cuCtxSynchronize());
+			CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
+
+			void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f};
+
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+		}
+
+		cuMemFree(temporary_mem);
+
+		{
+			CUfunction cuNLMNormalize;
+			cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+			void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+			CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+			CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+			cuda_assert(cuCtxSynchronize());
+		}
 
 		return !have_error();
 	}
@@ -1194,91 +1212,81 @@ public:
 		mem_zero(task->storage.XtWX);
 		mem_zero(task->storage.XtWY);
 
-		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-		cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-		cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-		cuda_assert(cuModuleGetFunction(&cuFinalize,            cuFilterModule, "kernel_cuda_filter_finalize"));
+		int r = task->radius;
+		int f = 4;
+		float a = 1.0f;
+		float k_2 = task->nlm_k_2;
 
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-		cuda_assert(cuFuncSetCacheConfig(cuFinalize,            CU_FUNC_CACHE_PREFER_L1));
+		int w = task->reconstruction_state.source_w;
+		int h = task->reconstruction_state.source_h;
+		int stride = task->buffer.stride;
 
-		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
-		                   task->reconstruction_state.source_w,
-		                   task->reconstruction_state.source_h);
+		int shift_stride = stride*h;
+		int num_shifts = (2*r+1)*(2*r+1);
+		int mem_size = sizeof(float)*shift_stride*num_shifts;
 
-		CUdeviceptr difference     = task->reconstruction_state.temporary_1_ptr;
-		CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+		CUdeviceptr temporary_mem;
+		cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size));
+		CUdeviceptr difference     = temporary_mem;
+		CUdeviceptr blurDifference = temporary_mem + mem_size;
 
-		int r = task->radius;
-		int f = 4;
-		float a = 1.0f;
-		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-			int dy = i / (2*r+1) - r;
-			int dx = i % (2*r+1) - r;
-
-			int local_rect[4] = {max(0, -dx), max(0, -dy),
-			                     task->reconstruction_state.source_w - max(0, dx),
-			                     task->reconstruction_state.source_h - max(0, dy)};
-
-			void *calc_difference_args[] = {&dx, &dy,
-			                                &color_ptr,
-			                                &color_variance_ptr,
-			                                &difference,
-			                                &local_rect,
-			                                &task->buffer.w,
-			                                &task->buffer.pass_stride,
-			                                &a,
-			                                &task->nlm_k_2};
-			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-
-			void *blur_args[] = {&difference,
-			                     &blurDifference,
-			                     &local_rect,
-			                     &task->buffer.w,
-			                     &f};
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
-			void *calc_weight_args[] = {&blurDifference,
-			                            &difference,
-			                            &local_rect,
-			                            &task->buffer.w,
-			                            &f};
-			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-
-			/* Reuse previous arguments. */
-			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
-			void *construct_gramian_args[] = {&dx, &dy,
-			                                  &blurDifference,
+		{
+			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+			cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+			cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+			cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+			CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+			                     task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+			                     num_shifts);
+
+			void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *construct_gramian_args[] = {&blurDifference,
 			                                  &task->buffer.mem.device_pointer,
 			                                  &task->storage.transform.device_pointer,
 			                                  &task->storage.rank.device_pointer,
 			                                  &task->storage.XtWX.device_pointer,
 			                                  &task->storage.XtWY.device_pointer,
-			                                  &local_rect,
-			                                  &task->reconstruction_state.filter_rect,
-			                                  &task->buffer.w,
-			                                  &task->buffer.h,
+			                                  &task->reconstruction_state.filter_window,
+			                                  &w, &h, &stride,
+			                                  &shift_stride, &r,
 			                                  &f,
 		                                      &task->buffer.pass_stride};
-			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
-		}
-
-		void *finalize_args[] = {&task->buffer.w,
-		                         &task->buffer.h,
-		                         &output_ptr,
-				                 &task->storage.rank.device_pointer,
-				                 &task->storage.XtWX.device_pointer,
-				                 &task->storage.XtWY.device_pointer,
-				                 &task->filter_area,
-				                 &task->reconstruction_state.buffer_params.x,
-				                 &task->render_buffer.samples};
-		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+		}
+
+		cuMemFree(temporary_mem);
+
+		{
+			CUfunction cuFinalize;
+			cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+			cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+			void *finalize_args[] = {&output_ptr,
+					                 &task->storage.rank.device_pointer,
+					                 &task->storage.XtWX.device_pointer,
+					                 &task->storage.XtWY.device_pointer,
+					                 &task->filter_area,
+					                 &task->reconstruction_state.buffer_params.x,
+					                 &task->render_buffer.samples};
+			CUDA_GET_BLOCKSIZE(cuFinalize,
+			                   task->reconstruction_state.source_w,
+			                   task->reconstruction_state.source_h);
+			CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		}
+
 		cuda_assert(cuCtxSynchronize());
 
 		return !have_error();
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 69c43e4a8cf..1862deb9a61 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task)
 	render_buffer.denoising_clean_offset = task.pass_denoising_clean;
 
 	/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
-	rect = make_int4(max(tiles->x[0], filter_area.x - radius),
-	                 max(tiles->y[0], filter_area.y - radius),
-	                 min(tiles->x[3], filter_area.x + filter_area.z + radius),
-	                 min(tiles->y[3], filter_area.y + filter_area.w + radius));
+	rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
+	rect = rect_expand(rect, radius);
+	rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3]));
 }
 
 void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
@@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising()
 {
 	/* Allocate denoising buffer. */
 	buffer.passes = 14;
-	buffer.w = align_up(rect.z - rect.x, 4);
+	buffer.width = rect.z - rect.x;
+	buffer.stride = align_up(buffer.width, 4);
 	buffer.h = rect.w - rect.y;
-	buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+	buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
 	buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
 
 	device_ptr null_ptr = (device_ptr) 0;
@@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising()
 
 	functions.construct_transform();
 
-	storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false);
-	storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false);
-	reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer;
-	reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer;
+	device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
+	device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
+	temporary_1.alloc_to_device(buffer.pass_stride, false);
+	temporary_2.alloc_to_device(buffer.pass_stride, false);
+	reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+	reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
 
 	storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
 	storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
 
-	reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+	reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
 	int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
 	reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
 	                                               render_buffer.stride,
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index ec4e7933cdc..77a82d0ad04 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -94,7 +94,7 @@ public:
 		device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
 		device_ptr temporary_2_ptr;
 
-		int4 filter_rect;
+		int4 filter_window;
 		int4 buffer_params;
 
 		int source_w;
@@ -148,8 +148,9 @@ public:
 	struct DenoiseBuffers {
 		int pass_stride;
 		int passes;
-		int w;
+		int stride;
 		int h;
+		int width;
 		device_only_memory<float> mem;
 
 		DenoiseBuffers(Device *device)
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index c02f8ffafe6..f38c2f65c1e 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -353,7 +353,9 @@ public:
 	void tex_free(device_memory& mem);
 
 	size_t global_size_round_up(int group_size, int global_size);
-	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
+	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
+	                    bool x_workgroups = false,
+	                    size_t max_workgroup_size = -1);
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
 	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
 
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index f43177247ef..fe084edc90e 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
 	return global_size + ((r == 0)? 0: group_size - r);
 }
 
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
 {
 	size_t workgroup_size, max_work_items[3];
 
@@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size
 	}
 
 	/* Try to divide evenly over 2 dimensions. */
-	size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
-	size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+	size_t local_size[2];
+	if(x_workgroups) {
+		local_size[0] = workgroup_size;
+		local_size[1] = 1;
+	}
+	else {
+		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+		local_size[0] = local_size[1] = sqrt_workgroup_size;
+	}
 
 	/* Some implementations have max size 1 on 2nd dimension. */
 	if(local_size[1] > max_work_items[1]) {
@@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
                                                  device_ptr out_ptr,
                                                  DenoisingTask *task)
 {
-	int4 rect = task->rect;
-	int w = rect.z-rect.x;
-	int h = rect.w-rect.y;
+
+	int stride = task->buffer.stride;
+	int w = task->buffer.width;
+	int h = task->buffer.h;
 	int r = task->nlm_state.r;
 	int f = task->nlm_state.f;
 	float a = task->nlm_state.a;
 	float k_2 = task->nlm_state.k_2;
 
-	cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
-	cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
-	cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+	int shift_stride = stride*h;
+	int num_shifts = (2*r+1)*(2*r+1);
+	int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+	cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+	cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+	cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
 
 	cl_mem image_mem = CL_MEM_PTR(image_ptr);
 	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
@@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
 	cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
 	cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
 
-	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-		int dy = i / (2*r+1) - r;
-		int dx = i % (2*r+1) - r;
-		int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-		kernel_set_args(ckNLMCalcDifference, 0,
-		                dx, dy, guide_mem, variance_mem,
-		                difference, local_rect, w, 0, a, k_2);
-		kernel_set_args(ckNLMBlur, 0,
-		                difference, blurDifference, local_rect, w, f);
-		kernel_set_args(ckNLMCalcWeight, 0,
-		                blurDifference, difference, local_rect, w, f);
-		kernel_set_args(ckNLMUpdateOutput, 0,
-		                dx, dy, blurDifference, image_mem,
-		                out_mem, weightAccum, local_rect, w, f);
-
-		enqueue_kernel(ckNLMCalcDifference, w, h);
-		enqueue_kernel(ckNLMBlur,           w, h);
-		enqueue_kernel(ckNLMCalcWeight,     w, h);
-		enqueue_kernel(ckNLMBlur,           w, h);
-		enqueue_kernel(ckNLMUpdateOutput,   w, h);
-	}
+	kernel_set_args(ckNLMCalcDifference, 0,
+	                guide_mem,
+	                variance_mem,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                r, 0, a, k_2);
+	kernel_set_args(ckNLMBlur, 0,
+	                difference,
+	                blurDifference,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+	kernel_set_args(ckNLMCalcWeight, 0,
+	                blurDifference,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+	kernel_set_args(ckNLMUpdateOutput, 0,
+	                blurDifference,
+	                image_mem,
+	                out_mem,
+	                weightAccum,
+	                w, h, stride,
+	                shift_stride,
+	                r, f);
+
+	enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+	enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+	enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
+
+	opencl_assert(clReleaseMemObject(difference));
+	opencl_assert(clReleaseMemObject(blurDifference));
 
-	int4 local_rect = make_int4(0, 0, w, h);
 	kernel_set_args(ckNLMNormalize, 0,
-	                out_mem, weightAccum, local_rect, w);
+	                out_mem, weightAccum, w, h, stride);
 	enqueue_kernel(ckNLMNormalize, w, h);
 
 	return true;
@@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 	cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
 	cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
 
-	cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
-	cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
-
-	int r = task->radius;
-	int f = 4;
-	float a = 1.0f;
-	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-		int dy = i / (2*r+1) - r;
-		int dx = i % (2*r+1) - r;
-
-		int local_rect[4] = {max(0, -dx), max(0, -dy),
-		                     task->reconstruction_state.source_w - max(0, dx),
-		                     task->reconstruction_state.source_h - max(0, dy)};
-
-		kernel_set_args(ckNLMCalcDifference, 0,
-		                dx, dy,
-		                color_mem,
-		                color_variance_mem,
-		                difference,
-		                local_rect,
-		                task->buffer.w,
-		                task->buffer.pass_stride,
-		                a, task->nlm_k_2);
-		enqueue_kernel(ckNLMCalcDifference,
-		               task->reconstruction_state.source_w,
-		               task->reconstruction_state.source_h);
-
-		kernel_set_args(ckNLMBlur, 0,
-		                difference,
-		                blurDifference,
-		                local_rect,
-		                task->buffer.w,
-		                f);
-		enqueue_kernel(ckNLMBlur,
-		               task->reconstruction_state.source_w,
-		               task->reconstruction_state.source_h);
-
-		kernel_set_args(ckNLMCalcWeight, 0,
-		                blurDifference,
-		                difference,
-		                local_rect,
-		                task->buffer.w,
-		                f);
-		enqueue_kernel(ckNLMCalcWeight,
-		               task->reconstruction_state.source_w,
-		               task->reconstruction_state.source_h);
-
-		/* Reuse previous arguments. */
-		enqueue_kernel(ckNLMBlur,
-		               task->reconstruction_state.source_w,
-		               task->reconstruction_state.source_h);
-
-		kernel_set_args(ckNLMConstructGramian, 0,
-		                dx, dy,
-		                blurDifference,
-		                buffer_mem,
-		                transform_mem,
-		                rank_mem,
-		                XtWX_mem,
-		                XtWY_mem,
-		                local_rect,
-		                task->reconstruction_state.filter_rect,
-		                task->buffer.w,
-		                task->buffer.h,
-		                f,
-	                    task->buffer.pass_stride);
-		enqueue_kernel(ckNLMConstructGramian,
-		               task->reconstruction_state.source_w,
-		               task->reconstruction_state.source_h,
-		               256);
-	}
+	int w = task->reconstruction_state.source_w;
+	int h = task->reconstruction_state.source_h;
+	int stride = task->buffer.stride;
+
+	int shift_stride = stride*h;
+	int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
+	int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+	cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+	cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+
+	kernel_set_args(ckNLMCalcDifference, 0,
+	                color_mem,
+	                color_variance_mem,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius,
+	                task->buffer.pass_stride,
+	                1.0f, task->nlm_k_2);
+	kernel_set_args(ckNLMBlur, 0,
+	                difference,
+	                blurDifference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4);
+	kernel_set_args(ckNLMCalcWeight, 0,
+	                blurDifference,
+	                difference,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4);
+	kernel_set_args(ckNLMConstructGramian, 0,
+	                blurDifference,
+	                buffer_mem,
+	                transform_mem,
+	                rank_mem,
+	                XtWX_mem,
+	                XtWY_mem,
+	                task->reconstruction_state.filter_window,
+	                w, h, stride,
+	                shift_stride,
+	                task->radius, 4,
+	                task->buffer.pass_stride);
+
+	enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+	enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
+	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+	enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+
+	opencl_assert(clReleaseMemObject(difference));
+	opencl_assert(clReleaseMemObject(blurDifference));
 
 	kernel_set_args(ckFinalize, 0,
-	                task->buffer.w,
-	                task->buffer.h,
 	                output_mem,
 	                rank_mem,
 	                XtWX_mem,
@@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 	                task->filter_area,
 	                task->reconstruction_state.buffer_params,
 	                task->render_buffer.samples);
-	enqueue_kernel(ckFinalize,
-	               task->reconstruction_state.source_w,
-	               task->reconstruction_state.source_h);
+	enqueue_kernel(ckFinalize, w, h);
 
 	return true;
 }
author	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-10 06:34:14 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-30 09:37:08 +0300
commit	fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree	516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/device
parent	df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)