Cycles: Implement denoising option for reducing noise in the rendered image

This commit contains the first part of the new Cycles denoising option, which filters the resulting image using information gathered during rendering to get rid of noise while preserving visual features as well as possible. To use the option, enable it in the render layer options. The default settings fit a wide range of scenes, but the user can tweak individual settings to control the tradeoff between a noise-free image, image details, and calculation time. Note that the denoiser may still change in the future and that some features are not implemented yet. The most important missing feature is animation denoising, which uses information from multiple frames at once to produce a flicker-free and smoother result. These features will be added in the future. Finally, thanks to all the people who supported this project: - Google (through the GSoC) and Theory Studios for sponsoring the development - The authors of the papers I used for implementing the denoiser (more details on them will be included in the technical docs) - The other Cycles devs for feedback on the code, especially Sergey for mentoring the GSoC project and Brecht for the code review! - And of course the users who helped with testing, reported bugs and things that could and/or should work better!
author: Lukas Stockner <lukas.stockner@freenet.de> 2017-05-07 15:40:58 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2017-05-07 15:40:58 +0300
commit: 43b374e8c5430488a302298b1026faa1c3a231e9 (patch)
tree: 42e619a9fa08d02cef515b6315ce34dd7fd062b2 /intern/cycles/device/device_cuda.cpp
parent: bca697834728fd12c84941aa2a428abfe2090b27 (diff)
1 files changed, 448 insertions, 47 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index a971170318e..968ee5bc487 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
 #include <string.h>
 
 #include "device/device.h"
+#include "device/device_denoising.h"
 #include "device/device_intern.h"
 #include "device/device_split_kernel.h"
 
 #include "render/buffers.h"
 
+#include "kernel/filter/filter_defines.h"
+
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
@@ -113,7 +116,7 @@ public:
 	DedicatedTaskPool task_pool;
 	CUdevice cuDevice;
 	CUcontext cuContext;
-	CUmodule cuModule;
+	CUmodule cuModule, cuFilterModule;
 	map<device_ptr, bool> tex_interp_map;
 	map<device_ptr, uint> tex_bindless_map;
 	int cuDevId;
@@ -170,7 +173,7 @@ public:
 		CUresult result = stmt; \
 		\
 		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -301,7 +304,8 @@ public:
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features, bool split=false)
+	        const DeviceRequestedFeatures& requested_features,
+	        bool filter=false, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
@@ -316,7 +320,7 @@ public:
 		                              machine,
 		                              cuda_version,
 		                              include_path.c_str());
-		if(use_adaptive_compilation()) {
+		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
 		const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -364,8 +368,22 @@ public:
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features,
+	                      bool filter=false, bool split=false)
 	{
+		const char *name, *source;
+		if(filter) {
+			name = "filter";
+			source = "filter.cu";
+		}
+		else if(split) {
+			name = "kernel_split";
+			source = "kernel_split.cu";
+		}
+		else {
+			name = "kernel";
+			source = "kernel.cu";
+		}
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -373,9 +391,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
-			                                                  : "lib/kernel_sm_%d%d.cubin",
-			                                            major, minor));
+			const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+			                                            name, major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
 				VLOG(1) << "Using precompiled kernel.";
@@ -384,7 +401,7 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features, split);
+		        compile_kernel_get_common_cflags(requested_features, filter, split);
 
 		/* Try to use locally compiled kernel. */
 		const string source_path = path_get("source");
@@ -395,9 +412,8 @@ public:
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
-		                                              : "cycles_kernel_sm%d%d_%s.cubin",
-		                                        major, minor,
+		const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+		                                        name, major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -432,7 +448,7 @@ public:
 		const string kernel = path_join(
 		        path_join(source_path, "kernel"),
 		        path_join("kernels",
-		                  path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+		                  path_join("cuda", source)));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -480,11 +496,14 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features, use_split_kernel());
-
+		string cubin = compile_kernel(requested_features, false, use_split_kernel());
 		if(cubin == "")
 			return false;
 
+		string filter_cubin = compile_kernel(requested_features, true, false);
+		if(filter_cubin == "")
+			return false;
+
 		/* open module */
 		cuda_push_context();
 
@@ -499,6 +518,14 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
 
+		if(path_read_text(filter_cubin, cubin_data))
+			result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+		else
+			result = CUDA_ERROR_FILE_NOT_FOUND;
+
+		if(cuda_error_(result, "cuModuleLoad"))
+			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
 		cuda_pop_context();
 
 		return (result == CUDA_SUCCESS);
@@ -581,6 +608,11 @@ public:
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		CUdeviceptr mem;
@@ -881,6 +913,368 @@ public:
 		}
 	}
 
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+	{
+		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
+		}
+
+		mem_copy_to(task->tiles_mem);
+
+		return !have_error();
+	}
+
+#define CUDA_GET_BLOCKSIZE(func, w, h)                                                                          \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int threads = (int)sqrt((float)threads_per_block);                                                  \
+			int xblocks = ((w) + threads - 1)/threads;                                                          \
+			int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args)                      \
+			cuda_assert(cuLaunchKernel(func,                \
+			                           xblocks, yblocks, 1, \
+			                           threads, threads, 1, \
+			                           0, 0, args, 0));
+
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		int4 rect = task->rect;
+		int w = rect.z-rect.x;
+		int h = rect.w-rect.y;
+		int r = task->nlm_state.r;
+		int f = task->nlm_state.f;
+		float a = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
+		CUdeviceptr difference     = task->nlm_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+		CUdeviceptr weightAccum    = task->nlm_state.temporary_3_ptr;
+
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+		cuda_assert(cuModuleGetFunction(&cuNLMNormalize,      cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,      CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+		int dx, dy;
+		int4 local_rect;
+		int channel_offset = 0;
+		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+		void *blur_args[]            = {&difference, &blurDifference, &local_rect, &w, &f};
+		void *calc_weight_args[]     = {&blurDifference, &difference, &local_rect, &w, &f};
+		void *update_output_args[]   = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			dy = i / (2*r+1) - r;
+			dx = i % (2*r+1) - r;
+			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+		}
+
+		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterConstructTransform;
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+		                   task->storage.w,
+		                   task->storage.h);
+
+		void *args[] = {&task->buffer.mem.device_pointer,
+		                &task->storage.transform.device_pointer,
+		                &task->storage.rank.device_pointer,
+		                &task->filter_area,
+		                &task->rect,
+		                &task->radius,
+		                &task->pca_threshold,
+		                &task->buffer.pass_stride};
+		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr guide_ptr,
+	                           device_ptr guide_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+
+		cuda_push_context();
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+		cuda_assert(cuModuleGetFunction(&cuFinalize,            cuFilterModule, "kernel_cuda_filter_finalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFinalize,            CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+		                   task->reconstruction_state.source_w,
+		                   task->reconstruction_state.source_h);
+
+		CUdeviceptr difference     = task->reconstruction_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+		int r = task->radius;
+		int f = 4;
+		float a = 1.0f;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+
+			void *calc_difference_args[] = {&dx, &dy,
+			                                &guide_ptr,
+			                                &guide_variance_ptr,
+			                                &difference,
+			                                &local_rect,
+			                                &task->buffer.w,
+			                                &task->buffer.pass_stride,
+			                                &a,
+			                                &task->nlm_k_2};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+			void *blur_args[] = {&difference,
+			                     &blurDifference,
+			                     &local_rect,
+			                     &task->buffer.w,
+			                     &f};
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *calc_weight_args[] = {&blurDifference,
+			                            &difference,
+			                            &local_rect,
+			                            &task->buffer.w,
+			                            &f};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+			/* Reuse previous arguments. */
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *construct_gramian_args[] = {&dx, &dy,
+			                                  &blurDifference,
+			                                  &task->buffer.mem.device_pointer,
+			                                  &color_ptr,
+			                                  &color_variance_ptr,
+			                                  &task->storage.transform.device_pointer,
+			                                  &task->storage.rank.device_pointer,
+			                                  &task->storage.XtWX.device_pointer,
+			                                  &task->storage.XtWY.device_pointer,
+			                                  &local_rect,
+			                                  &task->reconstruction_state.filter_rect,
+			                                  &task->buffer.w,
+			                                  &task->buffer.h,
+			                                  &f,
+		                                      &task->buffer.pass_stride};
+			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+		}
+
+		void *finalize_args[] = {&task->buffer.w,
+		                         &task->buffer.h,
+		                         &output_ptr,
+				                 &task->storage.rank.device_pointer,
+				                 &task->storage.XtWX.device_pointer,
+				                 &task->storage.XtWY.device_pointer,
+				                 &task->filter_area,
+				                 &task->reconstruction_state.buffer_params.x,
+				                 &task->render_buffer.samples};
+		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
+	{
+		(void) task;
+
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterCombineHalves;
+		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&mean_ptr,
+		                &variance_ptr,
+		                &a_ptr,
+		                &b_ptr,
+		                &rect,
+		                &r};
+		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		(void) task;
+
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterDivideShadow;
+		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		bool use_split_variance = use_split_kernel();
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &a_ptr,
+		                &b_ptr,
+		                &sample_variance_ptr,
+		                &sv_variance_ptr,
+		                &buffer_variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterGetFeature;
+		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		bool use_split_variance = use_split_kernel();
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+				        &mean_offset,
+				        &variance_offset,
+		                &mean_ptr,
+		                &variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	void denoise(RenderTile &rtile, const DeviceTask &task)
+	{
+		DenoisingTask denoising(this);
+
+		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+		denoising.render_buffer.samples = rtile.sample;
+
+		RenderTile rtiles[9];
+		rtiles[4] = rtile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+
+		denoising.init_from_devicetask(task);
+
+		denoising.run_denoising();
+
+		task.unmap_neighbor_tiles(rtiles, this);
+	}
+
 	void path_trace(RenderTile& rtile, int sample, bool branched)
 	{
 		if(have_error())
@@ -1305,7 +1699,7 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
+		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
 			bool branched = task->integrator_branched;
@@ -1313,30 +1707,8 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
-			if(!use_split_kernel()) {
-				/* keep rendering tiles until done */
-				while(task->acquire_tile(this, tile)) {
-					int start_sample = tile.start_sample;
-					int end_sample = tile.start_sample + tile.num_samples;
-
-					for(int sample = start_sample; sample < end_sample; sample++) {
-						if(task->get_cancel()) {
-							if(task->need_finish_queue == false)
-								break;
-						}
-
-						path_trace(tile, sample, branched);
-
-						tile.sample = sample + 1;
-
-						task->update_progress(&tile, tile.w*tile.h);
-					}
-
-					task->release_tile(tile);
-				}
-			}
-			else {
-				DeviceRequestedFeatures requested_features;
+			DeviceRequestedFeatures requested_features;
+			if(use_split_kernel()) {
 				if(!use_adaptive_compilation()) {
 					requested_features.max_closure = 64;
 				}
@@ -1345,18 +1717,47 @@ public:
 					split_kernel = new CUDASplitKernel(this);
 					split_kernel->load_kernels(requested_features);
 				}
+			}
+
+			/* keep rendering tiles until done */
+			while(task->acquire_tile(this, tile)) {
+				if(tile.task == RenderTile::PATH_TRACE) {
+					if(use_split_kernel()) {
+						device_memory void_buffer;
+						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+					}
+					else {
+						int start_sample = tile.start_sample;
+						int end_sample = tile.start_sample + tile.num_samples;
+
+						for(int sample = start_sample; sample < end_sample; sample++) {
+							if(task->get_cancel()) {
+								if(task->need_finish_queue == false)
+									break;
+							}
 
-				while(task->acquire_tile(this, tile)) {
-					device_memory void_buffer;
-					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+							path_trace(tile, sample, branched);
 
-					task->release_tile(tile);
+							tile.sample = sample + 1;
 
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
+							task->update_progress(&tile, tile.w*tile.h);
+						}
 					}
 				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+
+					denoise(tile, *task);
+
+					task->update_progress(&tile, tile.w*tile.h);
+				}
+
+				task->release_tile(tile);
+
+				if(task->get_cancel()) {
+					if(task->need_finish_queue == false)
+						break;
+				}
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
author	Lukas Stockner <lukas.stockner@freenet.de>	2017-05-07 15:40:58 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2017-05-07 15:40:58 +0300
commit	43b374e8c5430488a302298b1026faa1c3a231e9 (patch)
tree	42e619a9fa08d02cef515b6315ce34dd7fd062b2 /intern/cycles/device/device_cuda.cpp
parent	bca697834728fd12c84941aa2a428abfe2090b27 (diff)