diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2019-02-06 17:22:53 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2019-02-06 17:22:53 +0300 |
commit | e21ae0bb267a54482108ddd4feed99c89241804b (patch) | |
tree | 5d5220c578c0e41533a2a4430018ced6ff13e08c | |
parent | e8292466bcb69282798bba5dd701fff514cb0b78 (diff) | |
parent | fccf506ed7fd96f8a8f5edda7b99f564a386321a (diff) |
Merge branch 'blender2.7'
31 files changed, 1049 insertions, 419 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 7829e090c98..b8bc74f9e35 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -270,14 +270,11 @@ def register_passes(engine, scene, srl): engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR') if crl.denoising_store_passes: engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') - engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Shadowing", 1, "X", 'VALUE') + engine.register_pass(scene, srl, "Denoising Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Intensity", 1, "X", 'VALUE') clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", "denoising_glossy_direct", "denoising_glossy_indirect", "denoising_transmission_direct", "denoising_transmission_indirect", diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 10170c9d0ba..6e6d98b19dd 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -423,15 +423,19 @@ void BlenderSession::render(BL::Depsgraph& b_depsgraph_) buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool use_denoising = get_boolean(crl, "use_denoising"); - bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes"); + bool full_denoising = get_boolean(crl, "use_denoising"); + bool write_denoising_passes = get_boolean(crl, "denoising_store_passes"); - session->tile_manager.schedule_denoising = use_denoising; - buffer_params.denoising_data_pass = denoising_passes; + bool run_denoising = full_denoising || write_denoising_passes; + + session->tile_manager.schedule_denoising = run_denoising; + buffer_params.denoising_data_pass = run_denoising; buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_prefiltered_pass = write_denoising_passes; - session->params.use_denoising = use_denoising; - session->params.denoising_passes = denoising_passes; + session->params.run_denoising = run_denoising; + session->params.full_denoising = full_denoising; + session->params.write_denoising_passes = write_denoising_passes; session->params.denoising_radius = get_int(crl, "denoising_radius"); session->params.denoising_strength = get_float(crl, "denoising_strength"); session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); @@ -439,6 +443,7 @@ void BlenderSession::render(BL::Depsgraph& b_depsgraph_) scene->film->denoising_data_pass = buffer_params.denoising_data_pass; scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass; + scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass; session->params.denoising_radius = get_int(crl, "denoising_radius"); session->params.denoising_strength = get_float(crl, "denoising_strength"); session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index e41a80a14a5..072af281a73 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -482,7 +482,7 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) { string name = b_pass.name(); - if(name == "Noisy Image") return DENOISING_PASS_COLOR; + if(name == "Noisy Image") return DENOISING_PASS_PREFILTERED_COLOR; if(name.substr(0, 10) != "Denoising ") { return -1; @@ -490,15 +490,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) name = name.substr(10); #define MAP_PASS(passname, offset) if(name == passname) return offset; - MAP_PASS("Normal", DENOISING_PASS_NORMAL); - MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); - MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); - MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); - MAP_PASS("Depth", DENOISING_PASS_DEPTH); - MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); - MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A); - MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); - MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); + MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL); + MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO); + MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH); + MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING); + MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE); + MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY); MAP_PASS("Clean", DENOISING_PASS_CLEAN); #undef MAP_PASS @@ -530,10 +527,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, } PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool use_denoising = get_boolean(crp, "use_denoising"); - bool store_denoising_passes = get_boolean(crp, "denoising_store_passes"); + bool full_denoising = get_boolean(crp, "use_denoising"); + bool write_denoising_passes = get_boolean(crp, "denoising_store_passes"); + scene->film->denoising_flags = 0; - if(use_denoising || store_denoising_passes) { + if(full_denoising || write_denoising_passes) { #define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag; MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); @@ -547,16 +545,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); } - if(store_denoising_passes) { + if(write_denoising_passes) { b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_view_layer.name().c_str()); + b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); + b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); + b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) { b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str()); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index a92c052a5df..1f39a412083 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -180,20 +180,21 @@ public: KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -218,6 +219,7 @@ public: REGISTER_KERNEL(shader), REGISTER_KERNEL(filter_divide_shadow), REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_write_feature), REGISTER_KERNEL(filter_detect_outliers), REGISTER_KERNEL(filter_combine_halves), REGISTER_KERNEL(filter_nlm_calc_difference), @@ -487,6 +489,8 @@ public: int w = align_up(rect.z-rect.x, 4); int h = rect.w-rect.y; + int stride = task->buffer.stride; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *blurDifference = temporary_mem; @@ -504,10 +508,11 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) guide_ptr, (float*) variance_ptr, + NULL, difference, local_rect, - w, 0, - a, k_2); + w, channel_offset, + 0, a, k_2); filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); @@ -520,7 +525,8 @@ public: (float*) out_ptr, weightAccum, local_rect, - w, f); + channel_offset, + stride, f); } int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; @@ -536,6 +542,7 @@ public: for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + task->tile_info, x + task->filter_area.x, y + task->filter_area.y, y*task->filter_area.z + x, @@ -543,6 +550,8 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->buffer.pass_stride, + task->buffer.frame_stride, + task->buffer.use_time, task->radius, task->pca_threshold); } @@ -550,21 +559,20 @@ public: return true; } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *difference = temporary_mem; float *blurDifference = temporary_mem + task->buffer.pass_stride; int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; @@ -575,16 +583,19 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) color_ptr, (float*) color_variance_ptr, + (float*) scale_ptr, difference, local_rect, task->buffer.stride, task->buffer.pass_stride, + frame_offset, 1.0f, task->nlm_k_2); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, + task->tile_info->frames[frame], blurDifference, (float*) task->buffer.mem.device_pointer, (float*) task->storage.transform.device_pointer, @@ -595,8 +606,17 @@ public: &task->reconstruction_state.filter_window.x, task->buffer.stride, 4, - task->buffer.pass_stride); + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); } + + return true; + } + + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_finalize_kernel()(x, @@ -661,6 +681,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); @@ -674,6 +695,7 @@ public: x, y, (float*) mean_ptr, (float*) variance_ptr, + scale, &task->rect.x, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -682,6 +704,26 @@ public: return true; } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_write_feature_kernel()(task->render_buffer.samples, + x + task->filter_area.x, + y + task->filter_area.y, + &task->reconstruction_state.buffer_params.x, + (float*) from_ptr, + (float*) buffer_ptr, + out_offset, + &task->rect.x); + } + } + return true; + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -754,11 +796,13 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 67f5793e793..ada538adf32 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1300,7 +1300,8 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = 0; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + int frame_offset = 0; if(have_error()) return false; @@ -1308,6 +1309,7 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; + CUdeviceptr scale_ptr = 0; cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); @@ -1326,10 +1328,10 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1366,32 +1368,33 @@ public: task->storage.h); void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->filter_area, &task->rect, &task->radius, &task->pca_threshold, - &task->buffer.pass_stride}; + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); return !have_error(); } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { if(have_error()) return false; CUDAContextScope scope(this); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - int r = task->radius; int f = 4; float a = 1.0f; @@ -1400,6 +1403,8 @@ public: int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); @@ -1410,60 +1415,73 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - } + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, &h, + &stride, &pass_stride, + &r, &pass_stride, + &frame_offset, + &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &pass_stride, &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - } + return !have_error(); + } + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); cuda_assert(cuCtxSynchronize()); return !have_error(); @@ -1533,6 +1551,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { if(have_error()) @@ -1553,6 +1572,7 @@ public: &variance_offset, &mean_ptr, &variance_ptr, + &scale, &task->rect, &task->render_buffer.pass_stride, &task->render_buffer.offset}; @@ -1562,6 +1582,36 @@ public: return !have_error(); } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1596,11 +1646,13 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 433cbd3c265..61e0ba47ab8 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -36,14 +36,28 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); } + render_buffer.frame_stride = task.frame_stride; render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; - target_buffer.pass_stride = task.pass_stride; + target_buffer.pass_stride = task.target_pass_stride; target_buffer.denoising_clean_offset = task.pass_denoising_clean; + target_buffer.offset = 0; functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); + + tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); + tile_info->from_render = task.denoising_from_render? 1 : 0; + + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for(int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i-1]; + } + + write_passes = task.denoising_write_passes; + do_filter = task.denoising_do_filter; } DenoisingTask::~DenoisingTask() @@ -59,8 +73,6 @@ DenoisingTask::~DenoisingTask() void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - for(int i = 0; i < 9; i++) { tile_info->offsets[i] = rtiles[i].offset; tile_info->strides[i] = rtiles[i].stride; @@ -79,6 +91,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles) target_buffer.stride = rtiles[9].stride; target_buffer.ptr = rtiles[9].buffer; + if(write_passes && rtiles[9].buffers) { + target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + } + else { + target_buffer.denoising_output_offset = 0; + } + tile_info_mem.copy_to_device(); } @@ -89,15 +108,18 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.passes = 14; + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; @@ -129,14 +151,14 @@ void DenoisingTask::prefilter_shadowing() functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); /* Reuse memory, the previous data isn't needed anymore. */ device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); @@ -147,7 +169,7 @@ void DenoisingTask::prefilter_shadowing() device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; /* Use the residual variance for a second filter pass. */ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); @@ -167,9 +189,9 @@ void DenoisingTask::prefilter_features() for(int pass = 0; pass < 7; pass++) { device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples); /* Smooth the pass and store the result in the denoising buffers. */ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); } } @@ -188,13 +210,52 @@ void DenoisingTask::prefilter_color() for(int pass = 0; pass < num_color_passes; pass++) { device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples); } device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + if(buffer.use_intensity) { + device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true); + functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); + } +} + +void DenoisingTask::load_buffer() +{ + device_ptr null_ptr = (device_ptr) 0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity? 15 : 14; + for(int i = 0; i < tile_info->num_frames; i++) { + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; +} + +void DenoisingTask::write_buffer() +{ + reconstruction_state.buffer_params = make_int4(target_buffer.offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + int num_passes = buffer.use_intensity? 15 : 14; + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride); + int out_offset = pass + target_buffer.denoising_output_offset; + functions.write_feature(out_offset, *from_pass, target_buffer.ptr); + } } void DenoisingTask::construct_transform() @@ -212,6 +273,8 @@ void DenoisingTask::reconstruct() { storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); + storage.XtWX.zero_to_device(); + storage.XtWY.zero_to_device(); reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; @@ -224,7 +287,18 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr); + for(int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } + functions.solve(target_buffer.ptr); } void DenoisingTask::run_denoising(RenderTile *tile) @@ -236,12 +310,23 @@ void DenoisingTask::run_denoising(RenderTile *tile) setup_denoising_buffer(); - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); + if(tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } + + if(do_filter) { + construct_transform(); + reconstruct(); + } - construct_transform(); - reconstruct(); + if(write_passes) { + write_buffer(); + } functions.unmap_neighbor_tiles(rtiles); } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index beae60c220f..5869aa05390 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -38,6 +38,7 @@ public: struct RenderBuffers { int offset; int pass_stride; + int frame_stride; int samples; } render_buffer; @@ -47,6 +48,7 @@ public: int stride; int pass_stride; int denoising_clean_offset; + int denoising_output_offset; device_ptr ptr; } target_buffer; @@ -58,6 +60,9 @@ public: int4 rect; int4 filter_area; + bool write_passes; + bool do_filter; + struct DeviceFunctions { function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ @@ -66,8 +71,10 @@ public: )> non_local_means; function<bool(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr output_ptr - )> reconstruct; + device_ptr scale_ptr, + int frame + )> accumulate; + function<bool(device_ptr output_ptr)> solve; function<bool()> construct_transform; function<bool(device_ptr a_ptr, @@ -86,13 +93,18 @@ public: function<bool(int mean_offset, int variance_offset, device_ptr mean_ptr, - device_ptr variance_ptr + device_ptr variance_ptr, + float scale )> get_feature; function<bool(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr )> detect_outliers; + function<bool(int out_offset, + device_ptr frop_ptr, + device_ptr buffer_ptr + )> write_feature; function<void(RenderTile *rtiles)> map_neighbor_tiles; function<void(RenderTile *rtiles)> unmap_neighbor_tiles; } functions; @@ -114,8 +126,9 @@ public: int f; /* Patch size of the filter. */ float a; /* Variance compensation factor in the MSE estimation. */ float k_2; /* Squared value of the k parameter of the filter. */ + bool is_color; - void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; } } nlm_state; struct Storage { @@ -145,8 +158,11 @@ public: int stride; int h; int width; + int frame_stride; device_only_memory<float> mem; device_only_memory<float> temporary_mem; + bool use_time; + bool use_intensity; bool gpu_temporary_mem; @@ -166,6 +182,9 @@ protected: void prefilter_color(); void construct_transform(); void reconstruct(); + + void load_buffer(); + void write_buffer(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 861014373b3..2871bc5761a 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -72,7 +72,15 @@ public: float denoising_strength; float denoising_feature_strength; bool denoising_relative_pca; + bool denoising_from_render; + vector<int> denoising_frames; + + bool denoising_do_filter; + bool denoising_write_passes; + int pass_stride; + int frame_stride; + int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index ea7ed4f1909..9b763167459 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -419,10 +419,13 @@ protected: device_ptr out_ptr, DenoisingTask *task); bool denoising_construct_transform(DenoisingTask *task); - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task); bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, @@ -439,7 +442,12 @@ protected: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index d4d7c0f74bc..4417065bb7f 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); @@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_mem guide_mem = CL_MEM_PTR(guide_ptr); cl_mem variance_mem = CL_MEM_PTR(variance_ptr); cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); @@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, kernel_set_args(ckNLMCalcDifference, 0, guide_mem, variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, - r, 0, a, k_2); + r, channel_offset, + 0, a, k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, blurDifference_mem, @@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, weightAccum_mem, w, h, stride, pass_stride, + channel_offset, r, f); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); @@ -816,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, transform_mem, rank_mem, task->filter_area, task->rect, task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, task->radius, task->pca_threshold); @@ -837,17 +857,15 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) return true; } -bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - cl_mem color_mem = CL_MEM_PTR(color_ptr); cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); @@ -859,11 +877,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; int r = task->radius; int pass_stride = task->buffer.pass_stride; @@ -877,11 +897,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, r, pass_stride, + frame_offset, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, @@ -896,6 +918,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, pass_stride, r, 4); kernel_set_args(ckNLMConstructGramian, 0, + t, blurDifference_mem, buffer_mem, transform_mem, @@ -905,7 +928,9 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, task->reconstruction_state.filter_window, w, h, stride, pass_stride, - r, 4); + r, 4, + frame_offset, + use_time); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -913,6 +938,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + return true; +} + +bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr, + DenoisingTask *task) +{ + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + kernel_set_args(ckFinalize, 0, output_mem, rank_mem, @@ -1000,6 +1041,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { cl_mem mean_mem = CL_MEM_PTR(mean_ptr); @@ -1023,6 +1065,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, variance_offset, mean_mem, variance_mem, + scale, task->rect, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -1033,6 +1076,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, return true; } +bool OpenCLDeviceBase::denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) +{ + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + return true; +} + bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1063,11 +1131,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 67f4e62ac0f..cb04aac35f4 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -17,16 +17,21 @@ #ifndef __FILTER_DEFINES_H__ #define __FILTER_DEFINES_H__ -#define DENOISE_FEATURES 10 +#define DENOISE_FEATURES 11 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) #define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) #define XTWY_SIZE (DENOISE_FEATURES+1) +#define DENOISE_MAX_FRAMES 16 + typedef struct TileInfo { int offsets[9]; int strides[9]; int x[4]; int y[4]; + int from_render; + int frames[DENOISE_MAX_FRAMES]; + int num_frames; /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ ccl_global float *buffers[9]; diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 6226ed2c2ef..e1ea6487aa9 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -18,19 +18,23 @@ #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). - * pixel_buffer always points to the current pixel in the first pass. */ -#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { -#define END_FOR_PIXEL_WINDOW } \ - pixel_buffer += buffer_w - (high.x - low.x); \ +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features(int2 pixel, +ccl_device_inline void filter_get_features(int3 pixel, const ccl_global float *ccl_restrict buffer, float *features, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel, features[7] = ccl_get_feature(buffer, 5); features[8] = ccl_get_feature(buffer, 6); features[9] = ccl_get_feature(buffer, 7); + if(use_time) { + features[10] = pixel.z; + } if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < (use_time? 11 : 10); i++) { features[i] -= mean[i]; + } } } -ccl_device_inline void filter_get_feature_scales(int2 pixel, +ccl_device_inline void filter_get_feature_scales(int3 pixel, const ccl_global float *ccl_restrict buffer, float *scales, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], ccl_get_feature(buffer, 6) - mean[8], ccl_get_feature(buffer, 7) - mean[9])); + if(use_time) { + scales[6] = fabsf(pixel.z - mean[10]); + } } -ccl_device_inline void filter_calculate_scale(float *scale) +ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) { scale[0] = 1.0f/max(scale[0], 0.01f); scale[1] = 1.0f/max(scale[1], 0.01f); scale[2] = 1.0f/max(scale[2], 0.01f); + if(use_time) { + scale[10] = 1.0f/max(scale[6], 0.01f); + } scale[6] = 1.0f/max(scale[4], 0.01f); scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); @@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row, const ccl_global float *ccl_restrict transform, int stride, int row, - float feature) + float feature, + int transform_row_stride) { for(int i = 0; i < rank; i++) { - design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature; } } /* Fill the design row. */ -ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, +ccl_device_inline void filter_get_design_row_transform(int3 p_pixel, const ccl_global float *ccl_restrict p_buffer, - int2 q_pixel, + int3 q_pixel, const ccl_global float *ccl_restrict q_buffer, int pass_stride, int rank, float *design_row, const ccl_global float *ccl_restrict transform, - int stride) + int stride, + bool use_time) { + int num_features = use_time? 11 : 10; + design_row[0] = 1.0f; math_vector_zero(design_row+1, rank); - design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); - design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); - design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + +#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features); + DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); + DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); + DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + if(use_time) { + DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) + } +#undef DESIGN_ROW_ADD } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3ddd8712266..5dd001ffb93 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + float4 t4 = make_float4(pixel.z); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); -#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE } \ - pixel_buffer += buffer_w - (pixel.x - low.x); \ +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features_sse(float4 x, float4 y, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *features, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { + int num_features = use_time? 11 : 10; + features[0] = x; features[1] = y; features[2] = fabs(ccl_get_feature_sse(0)); @@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, features[7] = ccl_get_feature_sse(5); features[8] = ccl_get_feature_sse(6); features[9] = ccl_get_feature_sse(7); + if(use_time) { + features[10] = t; + } + if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = features[i] - mean[i]; + } } - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = mask(active_pixels, features[i]); + } } -ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *scales, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { @@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + sqr(ccl_get_feature_sse(7) - mean[9]); - for(int i = 0; i < 6; i++) + if(use_time) { + scales[6] = fabs(t - mean[10]); + } + + for(int i = 0; i < (use_time? 7 : 6); i++) scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(float4 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) { scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + if(use_time) { + scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));; + } scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index af73c0dadf2..9eb3c603a4a 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -22,10 +22,12 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { @@ -38,16 +40,24 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, for(int y = rect.y; y < rect.w; y++) { int idx_p = y*stride + aligned_lowx; - int idx_q = (y+dy)*stride + aligned_lowx + dx; + int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { float4 diff = make_float4(0.0f); + float4 scale_fac; + if(scale_image) { + scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), + make_float4(0.25f), make_float4(4.0f)); + } + else { + scale_fac = make_float4(1.0f); + } for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { /* idx_p is guaranteed to be aligned, but idx_q isn't. */ float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs); float4 cdiff = color_p - color_q; float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } load4_a(difference_image, idx_p) = diff*channel_fac; @@ -143,6 +153,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float *out_image, float *accum_image, int4 rect, + int channel_offset, int stride, int f) { @@ -160,13 +171,18 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, load4_a(accum_image, idx_p) += mask(active, weight); float4 val = load4_u(image, idx_q); + if(channel_offset) { + val += load4_u(image, idx_q + channel_offset); + val += load4_u(image, idx_q + 2*channel_offset); + val *= 1.0f/3.0f; + } load4_a(out_image, idx_p) += mask(active, weight*val); } } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float *transform, @@ -176,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int4 rect, int4 filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { int4 clip_area = rect_clip(rect, filter_window); /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ @@ -197,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int *l_rank = rank + storage_ofs; kernel_filter_construct_gramian(x, y, 1, - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, l_transform, l_rank, weight, l_XtWX, l_XtWY, 0); diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 058afb34a92..12636393243 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -78,17 +78,26 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { - float diff = 0.0f; + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset; int numChannels = channel_offset? 3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + + float diff = 0.0f; + float scale_fac = 1.0f; + if(scale_image) { + scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); + } + + for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { + float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q]; + float pvar = variance_image[idx_p]; + float qvar = sqr(scale_fac)*variance_image[idx_q]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { @@ -133,7 +142,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int stride, int f) + int4 rect, int channel_offset, + int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); @@ -142,17 +152,26 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); if(out_image) { - atomic_add_and_fetch_float(accum_image + y*stride + x, sum); - atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); + atomic_add_and_fetch_float(accum_image + idx_p, sum); + + float val = image[idx_q]; + if(channel_offset) { + val += image[idx_q + channel_offset]; + val += image[idx_q + 2*channel_offset]; + val *= 1.0f/3.0f; + } + atomic_add_and_fetch_float(out_image + idx_p, sum*val); } else { - accum_image[y*stride + x] = sum; + accum_image[idx_p] = sum; } } ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, - int dx, int dy, + int dx, int dy, int t, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, @@ -163,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int4 filter_window, int stride, int f, int pass_stride, + int frame_offset, + bool use_time, int localIdx) { const int low = max(rect.x, x-f); @@ -183,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, kernel_filter_construct_gramian(x, y, rect_size(filter_window), - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, transform, rank, weight, XtWX, XtWY, diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 3507f80df46..e24f4feb28d 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample, int x, int y, ccl_global float *mean, ccl_global float *variance, + float scale, int4 rect, int buffer_pass_stride, int buffer_denoising_offset) { @@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample, int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); - mean[idx] = center_buffer[m_offset] / sample; - if(sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; + float val = scale * center_buffer[m_offset]; + mean[idx] = val; + + if(v_offset >= 0) { + if(sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1))); + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } } } +ccl_device void kernel_filter_write_feature(int sample, + int x, int y, + int4 buffer_params, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 rect) +{ + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + combined_buffer[out_offset] = from[idx]; +} + ccl_device void kernel_filter_detect_outliers(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -119,6 +140,7 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, int n = 0; float values[25]; + float pixel_variance, max_variance = 0.0f; for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { int idx = (y1-rect.y)*buffer_w + (x1-rect.x); @@ -138,15 +160,31 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, /* Insert L. */ values[i] = L; n++; + + float3 pixel_var = make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]); + float var = average(pixel_var); + if((x1 == x) && (y1 == y)) { + pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f)? -1.0f : var; + } + else { + max_variance = max(max_variance, var); + } } } + max_variance += 1e-4f; + int idx = (y-rect.y)*buffer_w + (x-rect.x); float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); color = max(color, make_float3(0.0f, 0.0f, 0.0f)); float L = average(color); float ref = 2.0f*values[(int)(n*0.75f)]; + + /* Slightly offset values to avoid false positives in (almost) black areas. */ + max_variance += 1e-5f; + ref -= 1e-5f; + if(L > ref) { /* The pixel appears to be an outlier. * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel @@ -154,16 +192,24 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. */ - float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); - if(L - 3*stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + + if(pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { depth[idx] = -depth[idx]; - float fac = ref/L; - color *= fac; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; + color *= ref/L; + variance[idx] = variance[idx + pass_stride] = variance[idx + 2*pass_stride] = max_variance; + } + else { + float stddev = sqrtf(pixel_variance); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + float fac = ref/L; + color *= fac; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } } } out[idx ] = color.x; diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 58740d5b06a..31a7487c77a 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, - int dx, int dy, + int dx, int dy, int t, int buffer_stride, int pass_stride, + int frame_offset, + bool use_time, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, } int p_offset = y * buffer_stride + x; - int q_offset = (y+dy) * buffer_stride + (x+dx); + int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset; #ifdef __KERNEL_GPU__ const int stride = storage_stride; @@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, return; } - filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, - make_int2(x+dx, y+dy), buffer + q_offset, - pass_stride, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset, + make_int3(x+dx, y+dy, t), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride, use_time); #ifdef __KERNEL_GPU__ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); @@ -108,11 +110,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; - final_color *= sample; - if(buffer_params.w) { - final_color.x += combined_buffer[buffer_params.w+0]; - final_color.y += combined_buffer[buffer_params.w+1]; - final_color.z += combined_buffer[buffer_params.w+2]; + if(buffer_params.w >= 0) { + final_color *= sample; + if(buffer_params.w > 0) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } } combined_buffer[0] = final_color.x; combined_buffer[1] = final_color.y; diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index a5f87c05ec0..94e27bb02fd 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float features[DENOISE_FEATURES]; - /* Temporary storage, used in different steps of the algorithm. */ - float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; - float tempvector[2*DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + + int num_features = use_time? 11 : 10; /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float *feature_scale = tempvector; - math_vector_zero(feature_scale, DENOISE_FEATURES); + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float* feature_matrix = tempmatrix; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } @@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + math_vector_mul(transform + i*num_features, feature_scale, num_features); } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 83a1222bbdb..ed8ddcb49b1 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, ccl_global float *transform, ccl_global int *rank, int radius, float pca_threshold, @@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re float features[DENOISE_FEATURES]; #endif + int num_features = use_time? 11 : 10; + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; const ccl_global float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, DENOISE_FEATURES); + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + math_matrix_transpose(transform, num_features, transform_stride); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { for(int j = 0; j < (*rank); j++) { - transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + transform[(i*num_features + j)*transform_stride] *= feature_scale[i]; } } } diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 9e65f61664b..10bd3e477e9 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + int num_features = use_time? 11 : 10; + + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + /* === Shift feature passes to have mean 0. === */ float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, DENOISE_FEATURES); + math_vector_zero_sse(feature_means, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); - math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add_sse(feature_means, num_features, features); } END_FOR_PIXEL_WINDOW_SSE float4 pixel_scale = make_float4(1.0f / num_pixels); - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + math_vector_zero_sse(feature_scale, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW_SSE - filter_calculate_scale_sse(feature_scale); + filter_calculate_scale_sse(feature_scale, use_time); + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + math_matrix_zero_sse(feature_matrix_sse, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul_sse(features, num_features, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); + for(int i = 0; i < num_features; i++) { + math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 864aa7c470a..caa0057d997 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets { DENOISING_PASS_COLOR_VAR = 23, DENOISING_PASS_CLEAN = 26, + DENOISING_PASS_PREFILTERED_DEPTH = 0, + DENOISING_PASS_PREFILTERED_NORMAL = 1, + DENOISING_PASS_PREFILTERED_SHADOWING = 4, + DENOISING_PASS_PREFILTERED_ALBEDO = 5, + DENOISING_PASS_PREFILTERED_COLOR = 8, + DENOISING_PASS_PREFILTERED_VARIANCE = 11, + DENOISING_PASS_PREFILTERED_INTENSITY = 14, + DENOISING_PASS_SIZE_BASE = 26, DENOISING_PASS_SIZE_CLEAN = 3, + DENOISING_PASS_SIZE_PREFILTERED = 15, } DenoisingPassOffsets; typedef enum eBakePassFilter { diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index e036b53b810..02c85562db8 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset); +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect); + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -58,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, int r); void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tiles, int x, int y, int storage_ofs, @@ -65,17 +76,21 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int* rect, int stride, int channel_offset, + int frame_offset, float a, float k_2); @@ -99,11 +114,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int* rect, + int channel_offset, int stride, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -114,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride); + int pass_stride, + int frame_offset, + bool use_time); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 4c758711481..c29505880cb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int x, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, m_offset, v_offset, x, y, mean, variance, + scale, load_int4(prefilter_rect), buffer_pass_stride, buffer_denoising_offset); #endif } +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_write_feature); +#else + kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); +#endif +} + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -117,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, } void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tile_info, int x, int y, int storage_ofs, @@ -124,18 +143,23 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* prefilter_rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); #else - rank += storage_ofs; - transform += storage_ofs*TRANSFORM_SIZE; + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; kernel_filter_construct_transform(buffer, + tile_info, x, y, load_int4(prefilter_rect), pass_stride, + frame_stride, + use_time, transform, rank, radius, @@ -146,18 +170,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int *rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(dx, dy, + weight_image, + variance_image, + scale_image, + difference_image, + load_int4(rect), + stride, + channel_offset, + frame_offset, + a, k_2); #endif } @@ -195,18 +230,28 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int *rect, + int channel_offset, int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, + difference_image, + image, + temp_image, + out_image, + accum_image, + load_int4(rect), + channel_offset, + stride, f); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -217,12 +262,24 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, t, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + load_int4(rect), + load_int4(filter_window), + stride, f, + pass_stride, + frame_offset, + use_time); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index b856cbde45c..5b552b01413 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -29,7 +29,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -59,11 +59,12 @@ kernel_cuda_filter_divide_shadow(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_get_feature(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, int m_offset, int v_offset, float *mean, float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -76,6 +77,7 @@ kernel_cuda_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); @@ -84,6 +86,30 @@ kernel_cuda_filter_get_feature(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + float *from, + float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_detect_outliers(float *image, float *variance, float *depth, @@ -112,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + CCL_FILTER_TILE_INFO, float *transform, int *rank, int4 filter_area, int4 rect, int radius, float pca_threshold, - int pass_stride) + int pass_stride, int frame_stride, + bool use_time) { int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; @@ -123,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, int *l_rank = rank + y*filter_area.z + x; float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + tile_info, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -136,6 +167,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int w, int h, @@ -143,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -152,9 +185,12 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + frame_offset, + a, k_2); } } @@ -210,6 +246,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -221,7 +258,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } @@ -242,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_construct_gramian(int t, + const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float const* __restrict__ transform, int *rank, @@ -254,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + bool use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -268,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, threadIdx.y*blockDim.x + threadIdx.x); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index a550f97f4eb..996bc27f71b 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, int v_offset, ccl_global float *mean, ccl_global float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); } } +__kernel void kernel_ocl_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + __kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -103,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, } __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, ccl_global float *transform, ccl_global int *rank, int4 filter_area, int4 rect, int pass_stride, + int frame_stride, + char use_time, int radius, float pca_threshold) { @@ -117,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ ccl_global int *l_rank = rank + y*filter_area.z + x; ccl_global float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + CCL_FILTER_TILE_INFO_ARG, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -128,6 +158,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int w, int h, @@ -135,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -144,9 +176,12 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + frame_offset, + a, k_2); } } @@ -196,6 +231,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -207,7 +243,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } @@ -224,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, } } -__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image, +__kernel void kernel_ocl_filter_nlm_construct_gramian(int t, + const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -236,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + char use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -250,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, get_local_id(1)*get_local_size(0) + get_local_id(0)); } } diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index b4e3c18e894..0a60fcec096 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -42,6 +42,7 @@ BufferParams::BufferParams() denoising_data_pass = false; denoising_clean_pass = false; + denoising_prefiltered_pass = false; Pass::add(PASS_COMBINED, passes); } @@ -73,6 +74,7 @@ int BufferParams::get_passes_size() if(denoising_data_pass) { size += DENOISING_PASS_SIZE_BASE; if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + if(denoising_prefiltered_pass) size += DENOISING_PASS_SIZE_PREFILTERED; } return align_up(size, 4); @@ -88,6 +90,20 @@ int BufferParams::get_denoising_offset() return offset; } +int BufferParams::get_denoising_prefiltered_offset() +{ + assert(denoising_prefiltered_pass); + + int offset = get_denoising_offset(); + + offset += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) { + offset += DENOISING_PASS_SIZE_CLEAN; + } + + return offset; +} + /* Render Buffer Task */ RenderTile::RenderTile() @@ -153,81 +169,62 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample return false; } - float invsample = 1.0f/sample; - float scale = invsample; - bool variance = (type == DENOISING_PASS_NORMAL_VAR) || - (type == DENOISING_PASS_ALBEDO_VAR) || - (type == DENOISING_PASS_DEPTH_VAR) || - (type == DENOISING_PASS_COLOR_VAR); + float scale = 1.0f; + float alpha_scale = 1.0f/sample; + if(type == DENOISING_PASS_PREFILTERED_COLOR || + type == DENOISING_PASS_CLEAN || + type == DENOISING_PASS_PREFILTERED_INTENSITY) { + scale *= exposure; + } + else if(type == DENOISING_PASS_PREFILTERED_VARIANCE) { + scale *= exposure*exposure * (sample - 1); + } - float scale_exposure = scale; - if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) { - scale_exposure *= exposure; + int offset; + if(type == DENOISING_PASS_CLEAN) { + /* The clean pass isn't changed by prefiltering, so we use the original one there. */ + offset = type + params.get_denoising_offset(); } - else if(type == DENOISING_PASS_COLOR_VAR) { - scale_exposure *= exposure*exposure; + else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) { + /* If we're not saving the prefiltering result, return the original noisy pass. */ + offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; + scale /= sample; + } + else { + offset = type + params.get_denoising_prefiltered_offset(); } - int offset = type + params.get_denoising_offset(); int pass_stride = params.get_passes_size(); int size = params.width*params.height; - if(variance) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - int mean_offset = offset - components; - float *mean = buffer.data() + mean_offset; - float *var = buffer.data() + offset; - assert(mean_offset >= 0); - - if(components == 1) { - for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; - } + float *in = buffer.data() + offset; + + if(components == 1) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; } - else if(components == 3) { - for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; - pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure; - pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure; - } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; } - else { - return false; + } + else if(components == 4) { + /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ + assert(params.passes[0].type == PASS_COMBINED); + float *in_combined = buffer.data(); + + for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + pixels[3] = saturate(in_combined[3]*alpha_scale); } } else { - float *in = buffer.data() + offset; - - if(components == 1) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - pixels[0] = in[0]*scale_exposure; - } - } - else if(components == 3) { - for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - pixels[0] = in[0]*scale_exposure; - pixels[1] = in[1]*scale_exposure; - pixels[2] = in[2]*scale_exposure; - } - } - else if(components == 4) { - assert(type == DENOISING_PASS_COLOR); - - /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ - assert(params.passes[0].type == PASS_COMBINED); - float *in_combined = buffer.data(); - - for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { - pixels[0] = in[0]*scale_exposure; - pixels[1] = in[1]*scale_exposure; - pixels[2] = in[2]*scale_exposure; - pixels[3] = saturate(in_combined[3]*scale); - } - } - else { - return false; - } + return false; } return true; diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 46c3b89bd84..0a010718d6d 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -54,6 +54,10 @@ public: bool denoising_data_pass; /* If only some light path types should be denoised, an additional pass is needed. */ bool denoising_clean_pass; + /* When we're prefiltering the passes during rendering, we need to keep both the + * original and the prefiltered data around because neighboring tiles might still + * need the original data. */ + bool denoising_prefiltered_pass; /* functions */ BufferParams(); @@ -63,6 +67,7 @@ public: void add_pass(PassType type); int get_passes_size(); int get_denoising_offset(); + int get_denoising_prefiltered_offset(); }; /* Render Buffers */ diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 355294e23b5..1d1668a20d1 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -286,6 +286,7 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false); SOCKET_INT(denoising_flags, "Denoising Flags", 0); return type; @@ -478,6 +479,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; kfilm->use_light_pass = 1; } + if(denoising_prefiltered_pass) { + kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED; + } } kfilm->pass_stride = align_up(kfilm->pass_stride, 4); diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index c597db4e4c5..8330a4cf413 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -60,6 +60,7 @@ public: vector<Pass> passes; bool denoising_data_pass; bool denoising_clean_pass; + bool denoising_prefiltered_pass; int denoising_flags; float pass_alpha_threshold; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 3c6e88d4879..4b8630af6d1 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -689,7 +689,7 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - if(params.denoising_passes) { + if(params.run_denoising) { requested_features.use_denoising = true; requested_features.use_shadow_tricks = true; } @@ -927,7 +927,7 @@ void Session::update_status_time(bool show_pause, bool show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } - if(params.use_denoising) { + if(params.run_denoising) { substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); } } @@ -975,7 +975,7 @@ void Session::render() task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); - if(params.use_denoising) { + if(params.run_denoising) { task.denoising_radius = params.denoising_radius; task.denoising_strength = params.denoising_strength; task.denoising_feature_strength = params.denoising_feature_strength; @@ -983,8 +983,13 @@ void Session::render() assert(!scene->film->need_update); task.pass_stride = scene->film->pass_stride; + task.target_pass_stride = task.pass_stride; task.pass_denoising_data = scene->film->denoising_data_offset; task.pass_denoising_clean = scene->film->denoising_clean_offset; + + task.denoising_from_render = true; + task.denoising_do_filter = params.full_denoising; + task.denoising_write_passes = params.write_denoising_passes; } device->task_add(task); diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index c7f590915e7..cb1d8fed68f 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -60,8 +60,9 @@ public: bool display_buffer_linear; - bool use_denoising; - bool denoising_passes; + bool run_denoising; + bool write_denoising_passes; + bool full_denoising; int denoising_radius; float denoising_strength; float denoising_feature_strength; @@ -94,8 +95,9 @@ public: use_profiling = false; - use_denoising = false; - denoising_passes = false; + run_denoising = false; + write_denoising_passes = false; + full_denoising = false; denoising_radius = 8; denoising_strength = 0.0f; denoising_feature_strength = 0.0f; |