From 405cacd4cd955552e1f7b50a176ddcdd9baf8d3b Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Wed, 6 Feb 2019 12:42:10 +0100 Subject: Cycles: prefilter feature passes separate from denoising. Prefiltering of feature passes will happen during rendering, which can then be used for denoising immediately or written as a render pass for later (animation) denoising. The number of denoising data passes written is reduced because of this, leaving out the feature variance passes. The passes are now Normal, Albedo, Depth, Shadowing, Variance and Intensity. Ref D3889. --- intern/cycles/blender/addon/engine.py | 9 +- intern/cycles/blender/blender_session.cpp | 17 ++- intern/cycles/blender/blender_sync.cpp | 35 ++--- intern/cycles/device/device_cpu.cpp | 71 ++++++--- intern/cycles/device/device_cuda.cpp | 169 +++++++++++++-------- intern/cycles/device/device_denoising.cpp | 72 +++++++-- intern/cycles/device/device_denoising.h | 22 ++- intern/cycles/device/device_task.h | 6 + intern/cycles/device/opencl/opencl.h | 15 +- intern/cycles/device/opencl/opencl_base.cpp | 71 +++++++-- intern/cycles/kernel/filter/filter_defines.h | 1 + intern/cycles/kernel/filter/filter_nlm_cpu.h | 19 ++- intern/cycles/kernel/filter/filter_nlm_gpu.h | 36 +++-- intern/cycles/kernel/filter/filter_prefilter.h | 39 +++-- .../cycles/kernel/filter/filter_reconstruction.h | 12 +- intern/cycles/kernel/kernel_types.h | 9 ++ intern/cycles/kernel/kernels/cpu/filter_cpu.h | 14 +- intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 56 ++++++- intern/cycles/kernel/kernels/cuda/filter.cu | 36 ++++- intern/cycles/kernel/kernels/opencl/filter.cl | 34 ++++- intern/cycles/render/buffers.cpp | 123 ++++++++------- intern/cycles/render/buffers.h | 5 + intern/cycles/render/film.cpp | 4 + intern/cycles/render/film.h | 1 + intern/cycles/render/session.cpp | 11 +- intern/cycles/render/session.h | 10 +- 26 files changed, 644 insertions(+), 253 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 23239ee4352..83b9a8eee0c 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -269,14 +269,11 @@ def register_passes(engine, scene, srl): engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR') if crl.denoising_store_passes: engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') - engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Shadowing", 1, "X", 'VALUE') + engine.register_pass(scene, srl, "Denoising Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Intensity", 1, "X", 'VALUE') clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", "denoising_glossy_direct", "denoising_glossy_indirect", "denoising_transmission_direct", "denoising_transmission_indirect", diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index dfa92dd1bc7..50ac35069a9 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -418,15 +418,19 @@ void BlenderSession::render() buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); - bool use_denoising = get_boolean(crl, "use_denoising"); - bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes"); + bool full_denoising = get_boolean(crl, "use_denoising"); + bool write_denoising_passes = get_boolean(crl, "denoising_store_passes"); - session->tile_manager.schedule_denoising = use_denoising; - buffer_params.denoising_data_pass = denoising_passes; + bool run_denoising = full_denoising || write_denoising_passes; + + session->tile_manager.schedule_denoising = run_denoising; + buffer_params.denoising_data_pass = run_denoising; buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_prefiltered_pass = write_denoising_passes; - session->params.use_denoising = use_denoising; - session->params.denoising_passes = denoising_passes; + session->params.run_denoising = run_denoising; + session->params.full_denoising = full_denoising; + session->params.write_denoising_passes = write_denoising_passes; session->params.denoising_radius = get_int(crl, "denoising_radius"); session->params.denoising_strength = get_float(crl, "denoising_strength"); session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); @@ -434,6 +438,7 @@ void BlenderSession::render() scene->film->denoising_data_pass = buffer_params.denoising_data_pass; scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass; + scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass; scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold(); scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 703fcc2078b..a6050b66040 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -531,7 +531,7 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) { string name = b_pass.name(); - if(name == "Noisy Image") return DENOISING_PASS_COLOR; + if(name == "Noisy Image") return DENOISING_PASS_PREFILTERED_COLOR; if(name.substr(0, 10) != "Denoising ") { return -1; @@ -539,15 +539,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) name = name.substr(10); #define MAP_PASS(passname, offset) if(name == passname) return offset; - MAP_PASS("Normal", DENOISING_PASS_NORMAL); - MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); - MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); - MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); - MAP_PASS("Depth", DENOISING_PASS_DEPTH); - MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); - MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A); - MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); - MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); + MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL); + MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO); + MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH); + MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING); + MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE); + MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY); MAP_PASS("Clean", DENOISING_PASS_CLEAN); #undef MAP_PASS @@ -579,10 +576,11 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, } PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); - bool use_denoising = get_boolean(crp, "use_denoising"); - bool store_denoising_passes = get_boolean(crp, "denoising_store_passes"); + bool full_denoising = get_boolean(crp, "use_denoising"); + bool write_denoising_passes = get_boolean(crp, "denoising_store_passes"); + scene->film->denoising_flags = 0; - if(use_denoising || store_denoising_passes) { + if(full_denoising || write_denoising_passes) { #define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag; MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); @@ -596,16 +594,13 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, b_engine.add_pass("Noisy Image", 4, "RGBA", b_srlay.name().c_str()); } - if(store_denoising_passes) { + if(write_denoising_passes) { b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadowing", 1, "X", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Intensity", 1, "X", b_srlay.name().c_str()); if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) { b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str()); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 16908b0244a..6668acc9cbe 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -180,16 +180,17 @@ public: KernelFunctions convert_to_byte_kernel; KernelFunctions shader_kernel; - KernelFunctions filter_divide_shadow_kernel; - KernelFunctions filter_get_feature_kernel; + KernelFunctions filter_divide_shadow_kernel; + KernelFunctions filter_get_feature_kernel; + KernelFunctions filter_write_feature_kernel; KernelFunctions filter_detect_outliers_kernel; KernelFunctions filter_combine_halves_kernel; - KernelFunctions filter_nlm_calc_difference_kernel; - KernelFunctions filter_nlm_blur_kernel; - KernelFunctions filter_nlm_calc_weight_kernel; - KernelFunctions filter_nlm_update_output_kernel; - KernelFunctions filter_nlm_normalize_kernel; + KernelFunctions filter_nlm_calc_difference_kernel; + KernelFunctions filter_nlm_blur_kernel; + KernelFunctions filter_nlm_calc_weight_kernel; + KernelFunctions filter_nlm_update_output_kernel; + KernelFunctions filter_nlm_normalize_kernel; KernelFunctions filter_construct_transform_kernel; KernelFunctions filter_nlm_construct_gramian_kernel; @@ -218,6 +219,7 @@ public: REGISTER_KERNEL(shader), REGISTER_KERNEL(filter_divide_shadow), REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_write_feature), REGISTER_KERNEL(filter_detect_outliers), REGISTER_KERNEL(filter_combine_halves), REGISTER_KERNEL(filter_nlm_calc_difference), @@ -487,6 +489,8 @@ public: int w = align_up(rect.z-rect.x, 4); int h = rect.w-rect.y; + int stride = task->buffer.stride; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *blurDifference = temporary_mem; @@ -504,9 +508,10 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) guide_ptr, (float*) variance_ptr, + NULL, difference, local_rect, - w, 0, + w, channel_offset, a, k_2); filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); @@ -520,7 +525,8 @@ public: (float*) out_ptr, weightAccum, local_rect, - w, f); + channel_offset, + stride, f); } int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; @@ -550,16 +556,13 @@ public: return true; } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *difference = temporary_mem; float *blurDifference = temporary_mem + task->buffer.pass_stride; @@ -575,6 +578,7 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) color_ptr, (float*) color_variance_ptr, + (float*) scale_ptr, difference, local_rect, task->buffer.stride, @@ -597,6 +601,13 @@ public: 4, task->buffer.pass_stride); } + + return true; + } + + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_finalize_kernel()(x, @@ -661,6 +672,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); @@ -674,6 +686,7 @@ public: x, y, (float*) mean_ptr, (float*) variance_ptr, + scale, &task->rect.x, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -682,6 +695,26 @@ public: return true; } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_write_feature_kernel()(task->render_buffer.samples, + x + task->filter_area.x, + y + task->filter_area.y, + &task->reconstruction_state.buffer_params.x, + (float*) from_ptr, + (float*) buffer_ptr, + out_offset, + &task->rect.x); + } + } + return true; + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -754,11 +787,13 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 7b3c25a86d5..cb7d8bbb224 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1300,7 +1300,7 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = 0; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; if(have_error()) return false; @@ -1308,6 +1308,7 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; + CUdeviceptr scale_ptr = 0; cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); @@ -1326,10 +1327,10 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1379,19 +1380,16 @@ public: return !have_error(); } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + DenoisingTask *task) { if(have_error()) return false; CUDAContextScope scope(this); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - int r = task->radius; int f = 4; float a = 1.0f; @@ -1410,60 +1408,69 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - } + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, &h, + &stride, &pass_stride, + &r, &pass_stride, + &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &pass_stride, &r, + &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - } + return !have_error(); + } + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); cuda_assert(cuCtxSynchronize()); return !have_error(); @@ -1533,6 +1540,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { if(have_error()) @@ -1553,6 +1561,7 @@ public: &variance_offset, &mean_ptr, &variance_ptr, + &scale, &task->rect, &task->render_buffer.pass_stride, &task->render_buffer.offset}; @@ -1562,6 +1571,36 @@ public: return !have_error(); } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1596,11 +1635,13 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 433cbd3c265..724171c3acb 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -39,11 +39,18 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; - target_buffer.pass_stride = task.pass_stride; + target_buffer.pass_stride = task.target_pass_stride; target_buffer.denoising_clean_offset = task.pass_denoising_clean; + target_buffer.offset = 0; functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); + + tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); + tile_info->from_render = task.denoising_from_render? 1 : 0; + + write_passes = task.denoising_write_passes; + do_filter = task.denoising_do_filter; } DenoisingTask::~DenoisingTask() @@ -59,8 +66,6 @@ DenoisingTask::~DenoisingTask() void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - for(int i = 0; i < 9; i++) { tile_info->offsets[i] = rtiles[i].offset; tile_info->strides[i] = rtiles[i].stride; @@ -79,6 +84,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles) target_buffer.stride = rtiles[9].stride; target_buffer.ptr = rtiles[9].buffer; + if(write_passes && rtiles[9].buffers) { + target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + } + else { + target_buffer.denoising_output_offset = 0; + } + tile_info_mem.copy_to_device(); } @@ -89,7 +101,8 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.passes = 14; + buffer.use_intensity = write_passes; + buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; @@ -129,14 +142,14 @@ void DenoisingTask::prefilter_shadowing() functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); /* Reuse memory, the previous data isn't needed anymore. */ device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); @@ -147,7 +160,7 @@ void DenoisingTask::prefilter_shadowing() device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; /* Use the residual variance for a second filter pass. */ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); @@ -167,9 +180,9 @@ void DenoisingTask::prefilter_features() for(int pass = 0; pass < 7; pass++) { device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples); /* Smooth the pass and store the result in the denoising buffers. */ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); } } @@ -188,13 +201,33 @@ void DenoisingTask::prefilter_color() for(int pass = 0; pass < num_color_passes; pass++) { device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples); } device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + if(buffer.use_intensity) { + device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true); + functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); + } +} + +void DenoisingTask::write_buffer() +{ + reconstruction_state.buffer_params = make_int4(target_buffer.offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + int num_passes = buffer.use_intensity? 15 : 14; + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride); + int out_offset = pass + target_buffer.denoising_output_offset; + functions.write_feature(out_offset, *from_pass, target_buffer.ptr); + } } void DenoisingTask::construct_transform() @@ -212,6 +245,8 @@ void DenoisingTask::reconstruct() { storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); + storage.XtWX.zero_to_device(); + storage.XtWY.zero_to_device(); reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; @@ -224,7 +259,12 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr); + + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr); + delete scale_sub_ptr; + functions.solve(target_buffer.ptr); } void DenoisingTask::run_denoising(RenderTile *tile) @@ -240,8 +280,14 @@ void DenoisingTask::run_denoising(RenderTile *tile) prefilter_features(); prefilter_color(); - construct_transform(); - reconstruct(); + if(do_filter) { + construct_transform(); + reconstruct(); + } + + if(write_passes) { + write_buffer(); + } functions.unmap_neighbor_tiles(rtiles); } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index beae60c220f..cddcd3bd0c9 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -47,6 +47,7 @@ public: int stride; int pass_stride; int denoising_clean_offset; + int denoising_output_offset; device_ptr ptr; } target_buffer; @@ -58,6 +59,9 @@ public: int4 rect; int4 filter_area; + bool write_passes; + bool do_filter; + struct DeviceFunctions { function non_local_means; function reconstruct; + device_ptr scale_ptr + )> accumulate; + function solve; function construct_transform; function get_feature; function detect_outliers; + function write_feature; function map_neighbor_tiles; function unmap_neighbor_tiles; } functions; @@ -114,8 +124,9 @@ public: int f; /* Patch size of the filter. */ float a; /* Variance compensation factor in the MSE estimation. */ float k_2; /* Squared value of the k parameter of the filter. */ + bool is_color; - void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; } } nlm_state; struct Storage { @@ -147,6 +158,7 @@ public: int width; device_only_memory mem; device_only_memory temporary_mem; + bool use_intensity; bool gpu_temporary_mem; @@ -166,6 +178,8 @@ protected: void prefilter_color(); void construct_transform(); void reconstruct(); + + void write_buffer(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 861014373b3..97bcde99af6 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -72,7 +72,13 @@ public: float denoising_strength; float denoising_feature_strength; bool denoising_relative_pca; + bool denoising_from_render; + + bool denoising_do_filter; + bool denoising_write_passes; + int pass_stride; + int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index ea7ed4f1909..4d42ddc0c53 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -419,10 +419,12 @@ protected: device_ptr out_ptr, DenoisingTask *task); bool denoising_construct_transform(DenoisingTask *task); - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task); bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, @@ -439,7 +441,12 @@ protected: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index d4d7c0f74bc..a0a1cf68c32 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); @@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_mem guide_mem = CL_MEM_PTR(guide_ptr); cl_mem variance_mem = CL_MEM_PTR(variance_ptr); cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); @@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, kernel_set_args(ckNLMCalcDifference, 0, guide_mem, variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, - r, 0, a, k_2); + r, channel_offset, + 0, a, k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, blurDifference_mem, @@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, weightAccum_mem, w, h, stride, pass_stride, + channel_offset, r, f); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); @@ -837,17 +842,14 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) return true; } -bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + DenoisingTask *task) { - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - cl_mem color_mem = CL_MEM_PTR(color_ptr); cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); @@ -859,7 +861,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; @@ -877,6 +878,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, @@ -913,6 +915,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + return true; +} + +bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr, + DenoisingTask *task) +{ + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + kernel_set_args(ckFinalize, 0, output_mem, rank_mem, @@ -1000,6 +1018,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { cl_mem mean_mem = CL_MEM_PTR(mean_ptr); @@ -1023,6 +1042,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, variance_offset, mean_mem, variance_mem, + scale, task->rect, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -1033,6 +1053,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, return true; } +bool OpenCLDeviceBase::denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) +{ + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + return true; +} + bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1063,11 +1108,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 67f4e62ac0f..9ac7c3db23d 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -27,6 +27,7 @@ typedef struct TileInfo { int strides[9]; int x[4]; int y[4]; + int from_render; /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ ccl_global float *buffers[9]; diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index af73c0dadf2..0c4387af540 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int4 rect, int stride, @@ -41,13 +42,21 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, int idx_q = (y+dy)*stride + aligned_lowx + dx; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { float4 diff = make_float4(0.0f); + float4 scale_fac; + if(scale_image) { + scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), + make_float4(0.25f), make_float4(4.0f)); + } + else { + scale_fac = make_float4(1.0f); + } for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { /* idx_p is guaranteed to be aligned, but idx_q isn't. */ float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs); float4 cdiff = color_p - color_q; float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } load4_a(difference_image, idx_p) = diff*channel_fac; @@ -143,6 +152,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float *out_image, float *accum_image, int4 rect, + int channel_offset, int stride, int f) { @@ -160,6 +170,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, load4_a(accum_image, idx_p) += mask(active, weight); float4 val = load4_u(image, idx_q); + if(channel_offset) { + val += load4_u(image, idx_q + channel_offset); + val += load4_u(image, idx_q + 2*channel_offset); + val *= 1.0f/3.0f; + } load4_a(out_image, idx_p) += mask(active, weight*val); } diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 058afb34a92..d8e2e4d08aa 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -78,17 +78,25 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, float a, float k_2) { - float diff = 0.0f; + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); int numChannels = channel_offset? 3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + + float diff = 0.0f; + float scale_fac = 1.0f; + if(scale_image) { + scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); + } + + for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { + float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q]; + float pvar = variance_image[idx_p]; + float qvar = sqr(scale_fac)*variance_image[idx_q]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { @@ -133,7 +141,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int stride, int f) + int4 rect, int channel_offset, + int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); @@ -142,12 +151,21 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); if(out_image) { - atomic_add_and_fetch_float(accum_image + y*stride + x, sum); - atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); + atomic_add_and_fetch_float(accum_image + idx_p, sum); + + float val = image[idx_q]; + if(channel_offset) { + val += image[idx_q + channel_offset]; + val += image[idx_q + 2*channel_offset]; + val *= 1.0f/3.0f; + } + atomic_add_and_fetch_float(out_image + idx_p, sum*val); } else { - accum_image[y*stride + x] = sum; + accum_image[idx_p] = sum; } } diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 3507f80df46..41be4dbea49 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample, int x, int y, ccl_global float *mean, ccl_global float *variance, + float scale, int4 rect, int buffer_pass_stride, int buffer_denoising_offset) { @@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample, int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); - mean[idx] = center_buffer[m_offset] / sample; - if(sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; + float val = scale * center_buffer[m_offset]; + mean[idx] = val; + + if(v_offset >= 0) { + if(sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1))); + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } } } +ccl_device void kernel_filter_write_feature(int sample, + int x, int y, + int4 buffer_params, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 rect) +{ + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + combined_buffer[out_offset] = from[idx]; +} + ccl_device void kernel_filter_detect_outliers(int x, int y, ccl_global float *image, ccl_global float *variance, diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 58740d5b06a..e5d3b0da835 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -108,11 +108,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; - final_color *= sample; - if(buffer_params.w) { - final_color.x += combined_buffer[buffer_params.w+0]; - final_color.y += combined_buffer[buffer_params.w+1]; - final_color.z += combined_buffer[buffer_params.w+2]; + if(buffer_params.w >= 0) { + final_color *= sample; + if(buffer_params.w > 0) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } } combined_buffer[0] = final_color.x; combined_buffer[1] = final_color.y; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 864aa7c470a..caa0057d997 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets { DENOISING_PASS_COLOR_VAR = 23, DENOISING_PASS_CLEAN = 26, + DENOISING_PASS_PREFILTERED_DEPTH = 0, + DENOISING_PASS_PREFILTERED_NORMAL = 1, + DENOISING_PASS_PREFILTERED_SHADOWING = 4, + DENOISING_PASS_PREFILTERED_ALBEDO = 5, + DENOISING_PASS_PREFILTERED_COLOR = 8, + DENOISING_PASS_PREFILTERED_VARIANCE = 11, + DENOISING_PASS_PREFILTERED_INTENSITY = 14, + DENOISING_PASS_SIZE_BASE = 26, DENOISING_PASS_SIZE_CLEAN = 3, + DENOISING_PASS_SIZE_PREFILTERED = 15, } DenoisingPassOffsets; typedef enum eBakePassFilter { diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index e036b53b810..08333c7a455 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset); +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect); + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -71,7 +81,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int* rect, int stride, @@ -99,6 +110,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int* rect, + int channel_offset, int stride, int f); diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 4c758711481..b792367e3ab 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int x, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, m_offset, v_offset, x, y, mean, variance, + scale, load_int4(prefilter_rect), buffer_pass_stride, buffer_denoising_offset); #endif } +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_write_feature); +#else + kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); +#endif +} + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -130,8 +148,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); #else - rank += storage_ofs; - transform += storage_ofs*TRANSFORM_SIZE; + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; kernel_filter_construct_transform(buffer, x, y, load_int4(prefilter_rect), @@ -146,7 +164,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int *rect, int stride, @@ -157,7 +176,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(dx, dy, + weight_image, + variance_image, + scale_image, + difference_image, + load_int4(rect), + stride, + channel_offset, + a, k_2); #endif } @@ -195,13 +222,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int *rect, + int channel_offset, int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, + difference_image, + image, + temp_image, + out_image, + accum_image, + load_int4(rect), + channel_offset, + stride, f); #endif } @@ -222,7 +258,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + load_int4(rect), + load_int4(filter_window), + stride, f, + pass_stride); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index b856cbde45c..3b51bb41aed 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -64,6 +64,7 @@ kernel_cuda_filter_get_feature(int sample, int v_offset, float *mean, float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -76,12 +77,37 @@ kernel_cuda_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); } } +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + float *from, + float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_detect_outliers(float *image, @@ -136,6 +162,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int w, int h, @@ -152,9 +179,11 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + a, k_2); } } @@ -210,6 +239,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -221,7 +251,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index a550f97f4eb..8a821ee281d 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, int v_offset, ccl_global float *mean, ccl_global float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); } } +__kernel void kernel_ocl_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + __kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -128,6 +152,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int w, int h, @@ -144,9 +169,11 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + a, k_2); } } @@ -196,6 +223,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -207,7 +235,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index f901885e679..66b8ef73acc 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -42,6 +42,7 @@ BufferParams::BufferParams() denoising_data_pass = false; denoising_clean_pass = false; + denoising_prefiltered_pass = false; Pass::add(PASS_COMBINED, passes); } @@ -73,6 +74,7 @@ int BufferParams::get_passes_size() if(denoising_data_pass) { size += DENOISING_PASS_SIZE_BASE; if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + if(denoising_prefiltered_pass) size += DENOISING_PASS_SIZE_PREFILTERED; } return align_up(size, 4); @@ -88,6 +90,20 @@ int BufferParams::get_denoising_offset() return offset; } +int BufferParams::get_denoising_prefiltered_offset() +{ + assert(denoising_prefiltered_pass); + + int offset = get_denoising_offset(); + + offset += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) { + offset += DENOISING_PASS_SIZE_CLEAN; + } + + return offset; +} + /* Render Buffer Task */ RenderTile::RenderTile() @@ -153,81 +169,62 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample return false; } - float invsample = 1.0f/sample; - float scale = invsample; - bool variance = (type == DENOISING_PASS_NORMAL_VAR) || - (type == DENOISING_PASS_ALBEDO_VAR) || - (type == DENOISING_PASS_DEPTH_VAR) || - (type == DENOISING_PASS_COLOR_VAR); + float scale = 1.0f; + float alpha_scale = 1.0f/sample; + if(type == DENOISING_PASS_PREFILTERED_COLOR || + type == DENOISING_PASS_CLEAN || + type == DENOISING_PASS_PREFILTERED_INTENSITY) { + scale *= exposure; + } + else if(type == DENOISING_PASS_PREFILTERED_VARIANCE) { + scale *= exposure*exposure * (sample - 1); + } - float scale_exposure = scale; - if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) { - scale_exposure *= exposure; + int offset; + if(type == DENOISING_PASS_CLEAN) { + /* The clean pass isn't changed by prefiltering, so we use the original one there. */ + offset = type + params.get_denoising_offset(); } - else if(type == DENOISING_PASS_COLOR_VAR) { - scale_exposure *= exposure*exposure; + else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) { + /* If we're not saving the prefiltering result, return the original noisy pass. */ + offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; + scale /= sample; + } + else { + offset = type + params.get_denoising_prefiltered_offset(); } - int offset = type + params.get_denoising_offset(); int pass_stride = params.get_passes_size(); int size = params.width*params.height; - if(variance) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - int mean_offset = offset - components; - float *mean = buffer.data() + mean_offset; - float *var = buffer.data() + offset; - assert(mean_offset >= 0); - - if(components == 1) { - for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; - } + float *in = buffer.data() + offset; + + if(components == 1) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; } - else if(components == 3) { - for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; - pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure; - pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure; - } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; } - else { - return false; + } + else if(components == 4) { + /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ + assert(params.passes[0].type == PASS_COMBINED); + float *in_combined = buffer.data(); + + for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + pixels[3] = saturate(in_combined[3]*alpha_scale); } } else { - float *in = buffer.data() + offset; - - if(components == 1) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - pixels[0] = in[0]*scale_exposure; - } - } - else if(components == 3) { - for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - pixels[0] = in[0]*scale_exposure; - pixels[1] = in[1]*scale_exposure; - pixels[2] = in[2]*scale_exposure; - } - } - else if(components == 4) { - assert(type == DENOISING_PASS_COLOR); - - /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ - assert(params.passes[0].type == PASS_COMBINED); - float *in_combined = buffer.data(); - - for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { - pixels[0] = in[0]*scale_exposure; - pixels[1] = in[1]*scale_exposure; - pixels[2] = in[2]*scale_exposure; - pixels[3] = saturate(in_combined[3]*scale); - } - } - else { - return false; - } + return false; } return true; diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 46c3b89bd84..0a010718d6d 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -54,6 +54,10 @@ public: bool denoising_data_pass; /* If only some light path types should be denoised, an additional pass is needed. */ bool denoising_clean_pass; + /* When we're prefiltering the passes during rendering, we need to keep both the + * original and the prefiltered data around because neighboring tiles might still + * need the original data. */ + bool denoising_prefiltered_pass; /* functions */ BufferParams(); @@ -63,6 +67,7 @@ public: void add_pass(PassType type); int get_passes_size(); int get_denoising_offset(); + int get_denoising_prefiltered_offset(); }; /* Render Buffers */ diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index d0f15496e50..b305fa59123 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -286,6 +286,7 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false); SOCKET_INT(denoising_flags, "Denoising Flags", 0); return type; @@ -469,6 +470,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; kfilm->use_light_pass = 1; } + if(denoising_prefiltered_pass) { + kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED; + } } kfilm->pass_stride = align_up(kfilm->pass_stride, 4); diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index c597db4e4c5..8330a4cf413 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -60,6 +60,7 @@ public: vector passes; bool denoising_data_pass; bool denoising_clean_pass; + bool denoising_prefiltered_pass; int denoising_flags; float pass_alpha_threshold; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index c818f2b496c..3cee3b8bece 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -689,7 +689,7 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - if(params.denoising_passes) { + if(params.run_denoising) { requested_features.use_denoising = true; requested_features.use_shadow_tricks = true; } @@ -927,7 +927,7 @@ void Session::update_status_time(bool show_pause, bool show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } - if(params.use_denoising) { + if(params.run_denoising) { substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); } } @@ -975,7 +975,7 @@ void Session::render() task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); - if(params.use_denoising) { + if(params.run_denoising) { task.denoising_radius = params.denoising_radius; task.denoising_strength = params.denoising_strength; task.denoising_feature_strength = params.denoising_feature_strength; @@ -983,8 +983,13 @@ void Session::render() assert(!scene->film->need_update); task.pass_stride = scene->film->pass_stride; + task.target_pass_stride = task.pass_stride; task.pass_denoising_data = scene->film->denoising_data_offset; task.pass_denoising_clean = scene->film->denoising_clean_offset; + + task.denoising_from_render = true; + task.denoising_do_filter = params.full_denoising; + task.denoising_write_passes = params.write_denoising_passes; } device->task_add(task); diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index c7f590915e7..cb1d8fed68f 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -60,8 +60,9 @@ public: bool display_buffer_linear; - bool use_denoising; - bool denoising_passes; + bool run_denoising; + bool write_denoising_passes; + bool full_denoising; int denoising_radius; float denoising_strength; float denoising_feature_strength; @@ -94,8 +95,9 @@ public: use_profiling = false; - use_denoising = false; - denoising_passes = false; + run_denoising = false; + write_denoising_passes = false; + full_denoising = false; denoising_radius = 8; denoising_strength = 0.0f; denoising_feature_strength = 0.0f; -- cgit v1.2.3 From c183ac73dcfd20d0acf5ca07a2b062deadc4d73a Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Wed, 6 Feb 2019 14:42:32 +0100 Subject: Cycles: tweak outlier detection, preparing for animation denoising. Ref D3889. --- intern/cycles/kernel/filter/filter_prefilter.h | 43 ++++++++++++++++++++------ 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 41be4dbea49..e24f4feb28d 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -140,6 +140,7 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, int n = 0; float values[25]; + float pixel_variance, max_variance = 0.0f; for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { int idx = (y1-rect.y)*buffer_w + (x1-rect.x); @@ -159,15 +160,31 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, /* Insert L. */ values[i] = L; n++; + + float3 pixel_var = make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]); + float var = average(pixel_var); + if((x1 == x) && (y1 == y)) { + pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f)? -1.0f : var; + } + else { + max_variance = max(max_variance, var); + } } } + max_variance += 1e-4f; + int idx = (y-rect.y)*buffer_w + (x-rect.x); float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); color = max(color, make_float3(0.0f, 0.0f, 0.0f)); float L = average(color); float ref = 2.0f*values[(int)(n*0.75f)]; + + /* Slightly offset values to avoid false positives in (almost) black areas. */ + max_variance += 1e-5f; + ref -= 1e-5f; + if(L > ref) { /* The pixel appears to be an outlier. * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel @@ -175,16 +192,24 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. */ - float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); - if(L - 3*stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + + if(pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { depth[idx] = -depth[idx]; - float fac = ref/L; - color *= fac; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; + color *= ref/L; + variance[idx] = variance[idx + pass_stride] = variance[idx + 2*pass_stride] = max_variance; + } + else { + float stddev = sqrtf(pixel_variance); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + float fac = ref/L; + color *= fac; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } } } out[idx ] = color.x; -- cgit v1.2.3 From fccf506ed7fd96f8a8f5edda7b99f564a386321a Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Wed, 6 Feb 2019 14:19:20 +0100 Subject: Cycles: animation denoising support in the kernel. This is the internal implementation, not available from the API or interface yet. The algorithm takes into account past and future frames, both to get more coherent animation and reduce noise. Ref D3889. --- intern/cycles/device/device_cpu.cpp | 23 +++++-- intern/cycles/device/device_cuda.cpp | 21 ++++-- intern/cycles/device/device_denoising.cpp | 59 ++++++++++++++--- intern/cycles/device/device_denoising.h | 7 +- intern/cycles/device/device_task.h | 2 + intern/cycles/device/opencl/opencl.h | 1 + intern/cycles/device/opencl/opencl_base.cpp | 31 +++++++-- intern/cycles/kernel/filter/filter_defines.h | 6 +- intern/cycles/kernel/filter/filter_features.h | 77 ++++++++++++++-------- intern/cycles/kernel/filter/filter_features_sse.h | 53 ++++++++++----- intern/cycles/kernel/filter/filter_nlm_cpu.h | 13 ++-- intern/cycles/kernel/filter/filter_nlm_gpu.h | 11 +++- .../cycles/kernel/filter/filter_reconstruction.h | 12 ++-- intern/cycles/kernel/filter/filter_transform.h | 59 +++++++++-------- intern/cycles/kernel/filter/filter_transform_gpu.h | 54 ++++++++------- intern/cycles/kernel/filter/filter_transform_sse.h | 60 ++++++++++------- intern/cycles/kernel/kernels/cpu/filter_cpu.h | 9 ++- intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 19 +++++- intern/cycles/kernel/kernels/cuda/filter.cu | 25 +++++-- intern/cycles/kernel/kernels/opencl/filter.cl | 20 +++++- 20 files changed, 388 insertions(+), 174 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 6668acc9cbe..93c63b92a55 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -186,15 +186,15 @@ public: KernelFunctions filter_detect_outliers_kernel; KernelFunctions filter_combine_halves_kernel; - KernelFunctions filter_nlm_calc_difference_kernel; + KernelFunctions filter_nlm_calc_difference_kernel; KernelFunctions filter_nlm_blur_kernel; KernelFunctions filter_nlm_calc_weight_kernel; KernelFunctions filter_nlm_update_output_kernel; KernelFunctions filter_nlm_normalize_kernel; - KernelFunctions filter_construct_transform_kernel; - KernelFunctions filter_nlm_construct_gramian_kernel; - KernelFunctions filter_finalize_kernel; + KernelFunctions filter_construct_transform_kernel; + KernelFunctions filter_nlm_construct_gramian_kernel; + KernelFunctions filter_finalize_kernel; KernelFunctionsfilter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + task->tile_info, x + task->filter_area.x, y + task->filter_area.y, y*task->filter_area.z + x, @@ -549,6 +550,8 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->buffer.pass_stride, + task->buffer.frame_stride, + task->buffer.use_time, task->radius, task->pca_threshold); } @@ -559,6 +562,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); @@ -568,6 +572,7 @@ public: float *blurDifference = temporary_mem + task->buffer.pass_stride; int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; @@ -583,12 +588,14 @@ public: local_rect, task->buffer.stride, task->buffer.pass_stride, + frame_offset, 1.0f, task->nlm_k_2); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, + task->tile_info->frames[frame], blurDifference, (float*) task->buffer.mem.device_pointer, (float*) task->storage.transform.device_pointer, @@ -599,7 +606,9 @@ public: &task->reconstruction_state.filter_window.x, task->buffer.stride, 4, - task->buffer.pass_stride); + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); } return true; @@ -787,7 +796,7 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index cb7d8bbb224..e21d974ebbe 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1301,6 +1301,7 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + int frame_offset = 0; if(have_error()) return false; @@ -1327,7 +1328,7 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; @@ -1367,13 +1368,16 @@ public: task->storage.h); void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->filter_area, &task->rect, &task->radius, &task->pca_threshold, - &task->buffer.pass_stride}; + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); @@ -1383,6 +1387,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { if(have_error()) @@ -1398,6 +1403,8 @@ public: int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); @@ -1430,10 +1437,12 @@ public: &w, &h, &stride, &pass_stride, &r, &pass_stride, + &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, + void *construct_gramian_args[] = {&t, + &blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, @@ -1442,7 +1451,9 @@ public: &task->reconstruction_state.filter_window, &w, &h, &stride, &pass_stride, &r, - &f}; + &f, + &frame_offset, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1635,7 +1646,7 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 724171c3acb..61e0ba47ab8 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); } + render_buffer.frame_stride = task.frame_stride; render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; @@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); tile_info->from_render = task.denoising_from_render? 1 : 0; + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for(int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i-1]; + } + write_passes = task.denoising_write_passes; do_filter = task.denoising_do_filter; } @@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.use_intensity = write_passes; + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; @@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color() } } +void DenoisingTask::load_buffer() +{ + device_ptr null_ptr = (device_ptr) 0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity? 15 : 14; + for(int i = 0; i < tile_info->num_frames; i++) { + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; +} + void DenoisingTask::write_buffer() { reconstruction_state.buffer_params = make_int4(target_buffer.offset, @@ -259,11 +287,17 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr); - delete scale_sub_ptr; + for(int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } functions.solve(target_buffer.ptr); } @@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile) setup_denoising_buffer(); - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); + if(tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } if(do_filter) { construct_transform(); diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index cddcd3bd0c9..5869aa05390 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -38,6 +38,7 @@ public: struct RenderBuffers { int offset; int pass_stride; + int frame_stride; int samples; } render_buffer; @@ -70,7 +71,8 @@ public: )> non_local_means; function accumulate; function solve; function construct_transform; @@ -156,8 +158,10 @@ public: int stride; int h; int width; + int frame_stride; device_only_memory mem; device_only_memory temporary_mem; + bool use_time; bool use_intensity; bool gpu_temporary_mem; @@ -179,6 +183,7 @@ protected: void construct_transform(); void reconstruct(); + void load_buffer(); void write_buffer(); }; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 97bcde99af6..2871bc5761a 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -73,11 +73,13 @@ public: float denoising_feature_strength; bool denoising_relative_pca; bool denoising_from_render; + vector denoising_frames; bool denoising_do_filter; bool denoising_write_passes; int pass_stride; + int frame_stride; int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4d42ddc0c53..9b763167459 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -422,6 +422,7 @@ protected: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task); bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a0a1cf68c32..4417065bb7f 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, transform_mem, rank_mem, task->filter_area, task->rect, task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, task->radius, task->pca_threshold); @@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { cl_mem color_mem = CL_MEM_PTR(color_ptr); @@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; int r = task->radius; int pass_stride = task->buffer.pass_stride; @@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, pass_stride, + frame_offset, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, @@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, 4); kernel_set_args(ckNLMConstructGramian, 0, + t, blurDifference_mem, buffer_mem, transform_mem, @@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, task->reconstruction_state.filter_window, w, h, stride, pass_stride, - r, 4); + r, 4, + frame_offset, + use_time); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 9ac7c3db23d..cb04aac35f4 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -17,17 +17,21 @@ #ifndef __FILTER_DEFINES_H__ #define __FILTER_DEFINES_H__ -#define DENOISE_FEATURES 10 +#define DENOISE_FEATURES 11 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) #define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) #define XTWY_SIZE (DENOISE_FEATURES+1) +#define DENOISE_MAX_FRAMES 16 + typedef struct TileInfo { int offsets[9]; int strides[9]; int x[4]; int y[4]; int from_render; + int frames[DENOISE_MAX_FRAMES]; + int num_frames; /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ ccl_global float *buffers[9]; diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 6226ed2c2ef..e1ea6487aa9 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -18,19 +18,23 @@ #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). - * pixel_buffer always points to the current pixel in the first pass. */ -#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { -#define END_FOR_PIXEL_WINDOW } \ - pixel_buffer += buffer_w - (high.x - low.x); \ +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features(int2 pixel, +ccl_device_inline void filter_get_features(int3 pixel, const ccl_global float *ccl_restrict buffer, float *features, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel, features[7] = ccl_get_feature(buffer, 5); features[8] = ccl_get_feature(buffer, 6); features[9] = ccl_get_feature(buffer, 7); + if(use_time) { + features[10] = pixel.z; + } if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < (use_time? 11 : 10); i++) { features[i] -= mean[i]; + } } } -ccl_device_inline void filter_get_feature_scales(int2 pixel, +ccl_device_inline void filter_get_feature_scales(int3 pixel, const ccl_global float *ccl_restrict buffer, float *scales, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], ccl_get_feature(buffer, 6) - mean[8], ccl_get_feature(buffer, 7) - mean[9])); + if(use_time) { + scales[6] = fabsf(pixel.z - mean[10]); + } } -ccl_device_inline void filter_calculate_scale(float *scale) +ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) { scale[0] = 1.0f/max(scale[0], 0.01f); scale[1] = 1.0f/max(scale[1], 0.01f); scale[2] = 1.0f/max(scale[2], 0.01f); + if(use_time) { + scale[10] = 1.0f/max(scale[6], 0.01f); + } scale[6] = 1.0f/max(scale[4], 0.01f); scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); @@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row, const ccl_global float *ccl_restrict transform, int stride, int row, - float feature) + float feature, + int transform_row_stride) { for(int i = 0; i < rank; i++) { - design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature; } } /* Fill the design row. */ -ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, +ccl_device_inline void filter_get_design_row_transform(int3 p_pixel, const ccl_global float *ccl_restrict p_buffer, - int2 q_pixel, + int3 q_pixel, const ccl_global float *ccl_restrict q_buffer, int pass_stride, int rank, float *design_row, const ccl_global float *ccl_restrict transform, - int stride) + int stride, + bool use_time) { + int num_features = use_time? 11 : 10; + design_row[0] = 1.0f; math_vector_zero(design_row+1, rank); - design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); - design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); - design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + +#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features); + DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); + DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); + DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + if(use_time) { + DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) + } +#undef DESIGN_ROW_ADD } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3ddd8712266..5dd001ffb93 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + float4 t4 = make_float4(pixel.z); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); -#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE } \ - pixel_buffer += buffer_w - (pixel.x - low.x); \ +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features_sse(float4 x, float4 y, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *features, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { + int num_features = use_time? 11 : 10; + features[0] = x; features[1] = y; features[2] = fabs(ccl_get_feature_sse(0)); @@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, features[7] = ccl_get_feature_sse(5); features[8] = ccl_get_feature_sse(6); features[9] = ccl_get_feature_sse(7); + if(use_time) { + features[10] = t; + } + if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = features[i] - mean[i]; + } } - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = mask(active_pixels, features[i]); + } } -ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *scales, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { @@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + sqr(ccl_get_feature_sse(7) - mean[9]); - for(int i = 0; i < 6; i++) + if(use_time) { + scales[6] = fabs(t - mean[10]); + } + + for(int i = 0; i < (use_time? 7 : 6); i++) scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(float4 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) { scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + if(use_time) { + scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));; + } scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 0c4387af540..9eb3c603a4a 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { @@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, for(int y = rect.y; y < rect.w; y++) { int idx_p = y*stride + aligned_lowx; - int idx_q = (y+dy)*stride + aligned_lowx + dx; + int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { float4 diff = make_float4(0.0f); float4 scale_fac; @@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float *transform, @@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int4 rect, int4 filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { int4 clip_area = rect_clip(rect, filter_window); /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ @@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int *l_rank = rank + storage_ofs; kernel_filter_construct_gramian(x, y, 1, - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, l_transform, l_rank, weight, l_XtWX, l_XtWY, 0); diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index d8e2e4d08aa..12636393243 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset; int numChannels = channel_offset? 3 : 1; float diff = 0.0f; @@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, } ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, - int dx, int dy, + int dx, int dy, int t, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, @@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int4 filter_window, int stride, int f, int pass_stride, + int frame_offset, + bool use_time, int localIdx) { const int low = max(rect.x, x-f); @@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, kernel_filter_construct_gramian(x, y, rect_size(filter_window), - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, transform, rank, weight, XtWX, XtWY, diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index e5d3b0da835..31a7487c77a 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, - int dx, int dy, + int dx, int dy, int t, int buffer_stride, int pass_stride, + int frame_offset, + bool use_time, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, } int p_offset = y * buffer_stride + x; - int q_offset = (y+dy) * buffer_stride + (x+dx); + int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset; #ifdef __KERNEL_GPU__ const int stride = storage_stride; @@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, return; } - filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, - make_int2(x+dx, y+dy), buffer + q_offset, - pass_stride, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset, + make_int3(x+dx, y+dy, t), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride, use_time); #ifdef __KERNEL_GPU__ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index a5f87c05ec0..94e27bb02fd 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float features[DENOISE_FEATURES]; - /* Temporary storage, used in different steps of the algorithm. */ - float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; - float tempvector[2*DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + + int num_features = use_time? 11 : 10; /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float *feature_scale = tempvector; - math_vector_zero(feature_scale, DENOISE_FEATURES); + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float* feature_matrix = tempmatrix; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } @@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + math_vector_mul(transform + i*num_features, feature_scale, num_features); } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 83a1222bbdb..ed8ddcb49b1 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, ccl_global float *transform, ccl_global int *rank, int radius, float pca_threshold, @@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re float features[DENOISE_FEATURES]; #endif + int num_features = use_time? 11 : 10; + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; const ccl_global float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, DENOISE_FEATURES); + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + math_matrix_transpose(transform, num_features, transform_stride); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { for(int j = 0; j < (*rank); j++) { - transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + transform[(i*num_features + j)*transform_stride] *= feature_scale[i]; } } } diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 9e65f61664b..10bd3e477e9 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + int num_features = use_time? 11 : 10; + + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + /* === Shift feature passes to have mean 0. === */ float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, DENOISE_FEATURES); + math_vector_zero_sse(feature_means, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); - math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add_sse(feature_means, num_features, features); } END_FOR_PIXEL_WINDOW_SSE float4 pixel_scale = make_float4(1.0f / num_pixels); - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + math_vector_zero_sse(feature_scale, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW_SSE - filter_calculate_scale_sse(feature_scale); + filter_calculate_scale_sse(feature_scale, use_time); + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + math_matrix_zero_sse(feature_matrix_sse, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul_sse(features, num_features, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); + for(int i = 0; i < num_features; i++) { + math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index 08333c7a455..02c85562db8 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, int r); void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tiles, int x, int y, int storage_ofs, @@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold); @@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int* rect, int stride, int channel_offset, + int frame_offset, float a, float k_2); @@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride); + int pass_stride, + int frame_offset, + bool use_time); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index b792367e3ab..c29505880cb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, } void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tile_info, int x, int y, int storage_ofs, @@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* prefilter_rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold) { @@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, rank += storage_ofs; transform += storage_ofs*TRANSFORM_SIZE; kernel_filter_construct_transform(buffer, + tile_info, x, y, load_int4(prefilter_rect), pass_stride, + frame_stride, + use_time, transform, rank, radius, @@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int *rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { @@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, load_int4(rect), stride, channel_offset, + frame_offset, a, k_2); #endif } @@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, + kernel_filter_nlm_construct_gramian(dx, dy, t, difference_image, buffer, transform, rank, @@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, load_int4(rect), load_int4(filter_window), stride, f, - pass_stride); + pass_stride, + frame_offset, + use_time); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 3b51bb41aed..5b552b01413 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -29,7 +29,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_get_feature(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, int m_offset, int v_offset, float *mean, @@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + CCL_FILTER_TILE_INFO, float *transform, int *rank, int4 filter_area, int4 rect, int radius, float pca_threshold, - int pass_stride) + int pass_stride, int frame_stride, + bool use_time) { int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; @@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, int *l_rank = rank + y*filter_area.z + x; float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + tile_info, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, difference_image + ofs, rect, stride, channel_offset, + frame_offset, a, k_2); } } @@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_construct_gramian(int t, + const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float const* __restrict__ transform, int *rank, @@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + bool use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, threadIdx.y*blockDim.x + threadIdx.x); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 8a821ee281d..996bc27f71b 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, } __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, ccl_global float *transform, ccl_global int *rank, int4 filter_area, int4 rect, int pass_stride, + int frame_stride, + char use_time, int radius, float pca_threshold) { @@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ ccl_global int *l_rank = rank + y*filter_area.z + x; ccl_global float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + CCL_FILTER_TILE_INFO_ARG, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ difference_image + ofs, rect, stride, channel_offset, + frame_offset, a, k_2); } } @@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, } } -__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image, +__kernel void kernel_ocl_filter_nlm_construct_gramian(int t, + const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + char use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, get_local_id(1)*get_local_size(0) + get_local_id(0)); } } -- cgit v1.2.3