Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLukas Stockner <lukas.stockner@freenet.de>2019-02-06 14:42:10 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2019-02-06 17:18:29 +0300
commit405cacd4cd955552e1f7b50a176ddcdd9baf8d3b (patch)
treee54e2bf0c79bcc04d669088393b1d16df554bffd
parent81159e99b819910b72cb3caba6b3cd4f35184ea9 (diff)
Cycles: prefilter feature passes separate from denoising.
Prefiltering of feature passes will happen during rendering, which can then be used for denoising immediately or written as a render pass for later (animation) denoising. The number of denoising data passes written is reduced because of this, leaving out the feature variance passes. The passes are now Normal, Albedo, Depth, Shadowing, Variance and Intensity. Ref D3889.
-rw-r--r--intern/cycles/blender/addon/engine.py9
-rw-r--r--intern/cycles/blender/blender_session.cpp17
-rw-r--r--intern/cycles/blender/blender_sync.cpp35
-rw-r--r--intern/cycles/device/device_cpu.cpp71
-rw-r--r--intern/cycles/device/device_cuda.cpp169
-rw-r--r--intern/cycles/device/device_denoising.cpp72
-rw-r--r--intern/cycles/device/device_denoising.h22
-rw-r--r--intern/cycles/device/device_task.h6
-rw-r--r--intern/cycles/device/opencl/opencl.h15
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp71
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h1
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h19
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h36
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h39
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h12
-rw-r--r--intern/cycles/kernel/kernel_types.h9
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h14
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h56
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu36
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl34
-rw-r--r--intern/cycles/render/buffers.cpp123
-rw-r--r--intern/cycles/render/buffers.h5
-rw-r--r--intern/cycles/render/film.cpp4
-rw-r--r--intern/cycles/render/film.h1
-rw-r--r--intern/cycles/render/session.cpp11
-rw-r--r--intern/cycles/render/session.h10
26 files changed, 644 insertions, 253 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 23239ee4352..83b9a8eee0c 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -269,14 +269,11 @@ def register_passes(engine, scene, srl):
engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR')
if crl.denoising_store_passes:
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
- engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Shadowing", 1, "X", 'VALUE')
+ engine.register_pass(scene, srl, "Denoising Variance", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Intensity", 1, "X", 'VALUE')
clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
"denoising_glossy_direct", "denoising_glossy_indirect",
"denoising_transmission_direct", "denoising_transmission_indirect",
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index dfa92dd1bc7..50ac35069a9 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -418,15 +418,19 @@ void BlenderSession::render()
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
- bool use_denoising = get_boolean(crl, "use_denoising");
- bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes");
+ bool full_denoising = get_boolean(crl, "use_denoising");
+ bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
- session->tile_manager.schedule_denoising = use_denoising;
- buffer_params.denoising_data_pass = denoising_passes;
+ bool run_denoising = full_denoising || write_denoising_passes;
+
+ session->tile_manager.schedule_denoising = run_denoising;
+ buffer_params.denoising_data_pass = run_denoising;
buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+ buffer_params.denoising_prefiltered_pass = write_denoising_passes;
- session->params.use_denoising = use_denoising;
- session->params.denoising_passes = denoising_passes;
+ session->params.run_denoising = run_denoising;
+ session->params.full_denoising = full_denoising;
+ session->params.write_denoising_passes = write_denoising_passes;
session->params.denoising_radius = get_int(crl, "denoising_radius");
session->params.denoising_strength = get_float(crl, "denoising_strength");
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
@@ -434,6 +438,7 @@ void BlenderSession::render()
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
+ scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass;
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 703fcc2078b..a6050b66040 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -531,7 +531,7 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
{
string name = b_pass.name();
- if(name == "Noisy Image") return DENOISING_PASS_COLOR;
+ if(name == "Noisy Image") return DENOISING_PASS_PREFILTERED_COLOR;
if(name.substr(0, 10) != "Denoising ") {
return -1;
@@ -539,15 +539,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
name = name.substr(10);
#define MAP_PASS(passname, offset) if(name == passname) return offset;
- MAP_PASS("Normal", DENOISING_PASS_NORMAL);
- MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
- MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
- MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
- MAP_PASS("Depth", DENOISING_PASS_DEPTH);
- MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
- MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
- MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
- MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+ MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
+ MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
+ MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
+ MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
+ MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
+ MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
MAP_PASS("Clean", DENOISING_PASS_CLEAN);
#undef MAP_PASS
@@ -579,10 +576,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
}
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
- bool use_denoising = get_boolean(crp, "use_denoising");
- bool store_denoising_passes = get_boolean(crp, "denoising_store_passes");
+ bool full_denoising = get_boolean(crp, "use_denoising");
+ bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
+
scene->film->denoising_flags = 0;
- if(use_denoising || store_denoising_passes) {
+ if(full_denoising || write_denoising_passes) {
#define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag;
MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
@@ -596,16 +594,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
b_engine.add_pass("Noisy Image", 4, "RGBA", b_srlay.name().c_str());
}
- if(store_denoising_passes) {
+ if(write_denoising_passes) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadowing", 1, "X", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Intensity", 1, "X", b_srlay.name().c_str());
if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str());
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 16908b0244a..6668acc9cbe 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -180,16 +180,17 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -218,6 +219,7 @@ public:
REGISTER_KERNEL(shader),
REGISTER_KERNEL(filter_divide_shadow),
REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_write_feature),
REGISTER_KERNEL(filter_detect_outliers),
REGISTER_KERNEL(filter_combine_halves),
REGISTER_KERNEL(filter_nlm_calc_difference),
@@ -487,6 +489,8 @@ public:
int w = align_up(rect.z-rect.x, 4);
int h = rect.w-rect.y;
+ int stride = task->buffer.stride;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *blurDifference = temporary_mem;
@@ -504,9 +508,10 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) guide_ptr,
(float*) variance_ptr,
+ NULL,
difference,
local_rect,
- w, 0,
+ w, channel_offset,
a, k_2);
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
@@ -520,7 +525,8 @@ public:
(float*) out_ptr,
weightAccum,
local_rect,
- w, f);
+ channel_offset,
+ stride, f);
}
int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
@@ -550,16 +556,13 @@ public:
return true;
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *difference = temporary_mem;
float *blurDifference = temporary_mem + task->buffer.pass_stride;
@@ -575,6 +578,7 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) color_ptr,
(float*) color_variance_ptr,
+ (float*) scale_ptr,
difference,
local_rect,
task->buffer.stride,
@@ -597,6 +601,13 @@ public:
4,
task->buffer.pass_stride);
}
+
+ return true;
+ }
+
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_finalize_kernel()(x,
@@ -661,6 +672,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
@@ -674,6 +686,7 @@ public:
x, y,
(float*) mean_ptr,
(float*) variance_ptr,
+ scale,
&task->rect.x,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -682,6 +695,26 @@ public:
return true;
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_write_feature_kernel()(task->render_buffer.samples,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ &task->reconstruction_state.buffer_params.x,
+ (float*) from_ptr,
+ (float*) buffer_ptr,
+ out_offset,
+ &task->rect.x);
+ }
+ }
+ return true;
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -754,11 +787,13 @@ public:
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 7b3c25a86d5..cb7d8bbb224 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1300,7 +1300,7 @@ public:
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
- int channel_offset = 0;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
if(have_error())
return false;
@@ -1308,6 +1308,7 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
+ CUdeviceptr scale_ptr = 0;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
@@ -1326,10 +1327,10 @@ public:
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1379,19 +1380,16 @@ public:
return !have_error();
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
if(have_error())
return false;
CUDAContextScope scope(this);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
int r = task->radius;
int f = 4;
float a = 1.0f;
@@ -1410,60 +1408,69 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w, &h, &stride,
- &pass_stride, &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- }
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w, &h,
+ &stride, &pass_stride,
+ &r, &pass_stride,
+ &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w, &h, &stride,
+ &pass_stride, &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(cuFinalize,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- }
+ return !have_error();
+ }
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(cuFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
return !have_error();
@@ -1533,6 +1540,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
if(have_error())
@@ -1553,6 +1561,7 @@ public:
&variance_offset,
&mean_ptr,
&variance_ptr,
+ &scale,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.offset};
@@ -1562,6 +1571,36 @@ public:
return !have_error();
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1596,11 +1635,13 @@ public:
void denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 433cbd3c265..724171c3acb 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -39,11 +39,18 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
render_buffer.pass_stride = task.pass_stride;
render_buffer.offset = task.pass_denoising_data;
- target_buffer.pass_stride = task.pass_stride;
+ target_buffer.pass_stride = task.target_pass_stride;
target_buffer.denoising_clean_offset = task.pass_denoising_clean;
+ target_buffer.offset = 0;
functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
+
+ tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
+ tile_info->from_render = task.denoising_from_render? 1 : 0;
+
+ write_passes = task.denoising_write_passes;
+ do_filter = task.denoising_do_filter;
}
DenoisingTask::~DenoisingTask()
@@ -59,8 +66,6 @@ DenoisingTask::~DenoisingTask()
void DenoisingTask::set_render_buffer(RenderTile *rtiles)
{
- tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
-
for(int i = 0; i < 9; i++) {
tile_info->offsets[i] = rtiles[i].offset;
tile_info->strides[i] = rtiles[i].stride;
@@ -79,6 +84,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
target_buffer.stride = rtiles[9].stride;
target_buffer.ptr = rtiles[9].buffer;
+ if(write_passes && rtiles[9].buffers) {
+ target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+ }
+ else {
+ target_buffer.denoising_output_offset = 0;
+ }
+
tile_info_mem.copy_to_device();
}
@@ -89,7 +101,8 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_expand(rect, radius);
rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.passes = 14;
+ buffer.use_intensity = write_passes;
+ buffer.passes = buffer.use_intensity? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
@@ -129,14 +142,14 @@ void DenoisingTask::prefilter_shadowing()
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var,
filtered_b = *sample_var;
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
@@ -147,7 +160,7 @@ void DenoisingTask::prefilter_shadowing()
device_ptr final_a = *unfiltered_a,
final_b = *unfiltered_b;
/* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
@@ -167,9 +180,9 @@ void DenoisingTask::prefilter_features()
for(int pass = 0; pass < 7; pass++) {
device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
/* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples);
/* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
}
}
@@ -188,13 +201,33 @@ void DenoisingTask::prefilter_color()
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples);
}
device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+
+ if(buffer.use_intensity) {
+ device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true);
+ functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
+ }
+}
+
+void DenoisingTask::write_buffer()
+{
+ reconstruction_state.buffer_params = make_int4(target_buffer.offset,
+ target_buffer.stride,
+ target_buffer.pass_stride,
+ target_buffer.denoising_clean_offset);
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride);
+ int out_offset = pass + target_buffer.denoising_output_offset;
+ functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
+ }
}
void DenoisingTask::construct_transform()
@@ -212,6 +245,8 @@ void DenoisingTask::reconstruct()
{
storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
+ storage.XtWX.zero_to_device();
+ storage.XtWY.zero_to_device();
reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x;
@@ -224,7 +259,12 @@ void DenoisingTask::reconstruct()
device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
- functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr);
+
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
+ delete scale_sub_ptr;
+ functions.solve(target_buffer.ptr);
}
void DenoisingTask::run_denoising(RenderTile *tile)
@@ -240,8 +280,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
prefilter_features();
prefilter_color();
- construct_transform();
- reconstruct();
+ if(do_filter) {
+ construct_transform();
+ reconstruct();
+ }
+
+ if(write_passes) {
+ write_buffer();
+ }
functions.unmap_neighbor_tiles(rtiles);
}
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index beae60c220f..cddcd3bd0c9 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -47,6 +47,7 @@ public:
int stride;
int pass_stride;
int denoising_clean_offset;
+ int denoising_output_offset;
device_ptr ptr;
} target_buffer;
@@ -58,6 +59,9 @@ public:
int4 rect;
int4 filter_area;
+ bool write_passes;
+ bool do_filter;
+
struct DeviceFunctions {
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
@@ -66,8 +70,9 @@ public:
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
- device_ptr output_ptr
- )> reconstruct;
+ device_ptr scale_ptr
+ )> accumulate;
+ function<bool(device_ptr output_ptr)> solve;
function<bool()> construct_transform;
function<bool(device_ptr a_ptr,
@@ -86,13 +91,18 @@ public:
function<bool(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
- device_ptr variance_ptr
+ device_ptr variance_ptr,
+ float scale
)> get_feature;
function<bool(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr
)> detect_outliers;
+ function<bool(int out_offset,
+ device_ptr frop_ptr,
+ device_ptr buffer_ptr
+ )> write_feature;
function<void(RenderTile *rtiles)> map_neighbor_tiles;
function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
} functions;
@@ -114,8 +124,9 @@ public:
int f; /* Patch size of the filter. */
float a; /* Variance compensation factor in the MSE estimation. */
float k_2; /* Squared value of the k parameter of the filter. */
+ bool is_color;
- void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+ void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; }
} nlm_state;
struct Storage {
@@ -147,6 +158,7 @@ public:
int width;
device_only_memory<float> mem;
device_only_memory<float> temporary_mem;
+ bool use_intensity;
bool gpu_temporary_mem;
@@ -166,6 +178,8 @@ protected:
void prefilter_color();
void construct_transform();
void reconstruct();
+
+ void write_buffer();
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 861014373b3..97bcde99af6 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -72,7 +72,13 @@ public:
float denoising_strength;
float denoising_feature_strength;
bool denoising_relative_pca;
+ bool denoising_from_render;
+
+ bool denoising_do_filter;
+ bool denoising_write_passes;
+
int pass_stride;
+ int target_pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index ea7ed4f1909..4d42ddc0c53 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -419,10 +419,12 @@ protected:
device_ptr out_ptr,
DenoisingTask *task);
bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task);
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task);
bool denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
@@ -439,7 +441,12 @@ protected:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task);
+ bool denoising_write_feature(int to_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index d4d7c0f74bc..a0a1cf68c32 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
@@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem out_mem = CL_MEM_PTR(out_ptr);
+ cl_mem scale_mem = NULL;
mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
@@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
guide_mem,
variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
- r, 0, a, k_2);
+ r, channel_offset,
+ 0, a, k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
blurDifference_mem,
@@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
weightAccum_mem,
w, h, stride,
pass_stride,
+ channel_offset,
r, f);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
@@ -837,17 +842,14 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
return true;
}
-bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
cl_mem color_mem = CL_MEM_PTR(color_ptr);
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
@@ -859,7 +861,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
@@ -877,6 +878,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
color_mem,
color_variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
@@ -913,6 +915,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+
kernel_set_args(ckFinalize, 0,
output_mem,
rank_mem,
@@ -1000,6 +1018,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
@@ -1023,6 +1042,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
variance_offset,
mean_mem,
variance_mem,
+ scale,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -1033,6 +1053,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
return true;
}
+bool OpenCLDeviceBase::denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+{
+ cl_mem from_mem = CL_MEM_PTR(from_ptr);
+ cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
+
+ cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
+
+ kernel_set_args(ckFilterWriteFeature, 0,
+ task->render_buffer.samples,
+ task->reconstruction_state.buffer_params,
+ task->filter_area,
+ from_mem,
+ buffer_mem,
+ out_offset,
+ task->rect);
+ enqueue_kernel(ckFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ return true;
+}
+
bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1063,11 +1108,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 67f4e62ac0f..9ac7c3db23d 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -27,6 +27,7 @@ typedef struct TileInfo {
int strides[9];
int x[4];
int y[4];
+ int from_render;
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index af73c0dadf2..0c4387af540 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
+ const float *ccl_restrict scale_image,
float *difference_image,
int4 rect,
int stride,
@@ -41,13 +42,21 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
int idx_q = (y+dy)*stride + aligned_lowx + dx;
for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
float4 diff = make_float4(0.0f);
+ float4 scale_fac;
+ if(scale_image) {
+ scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+ make_float4(0.25f), make_float4(4.0f));
+ }
+ else {
+ scale_fac = make_float4(1.0f);
+ }
for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
/* idx_p is guaranteed to be aligned, but idx_q isn't. */
float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+ float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
float4 cdiff = color_p - color_q;
float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+ float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
}
load4_a(difference_image, idx_p) = diff*channel_fac;
@@ -143,6 +152,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
float *out_image,
float *accum_image,
int4 rect,
+ int channel_offset,
int stride,
int f)
{
@@ -160,6 +170,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
load4_a(accum_image, idx_p) += mask(active, weight);
float4 val = load4_u(image, idx_q);
+ if(channel_offset) {
+ val += load4_u(image, idx_q + channel_offset);
+ val += load4_u(image, idx_q + 2*channel_offset);
+ val *= 1.0f/3.0f;
+ }
load4_a(out_image, idx_p) += mask(active, weight*val);
}
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 058afb34a92..d8e2e4d08aa 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -78,17 +78,25 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
int dx, int dy,
const ccl_global float *ccl_restrict weight_image,
const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
ccl_global float *difference_image,
int4 rect, int stride,
int channel_offset,
float a, float k_2)
{
- float diff = 0.0f;
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
int numChannels = channel_offset? 3 : 1;
- for(int c = 0; c < numChannels; c++) {
- float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
- float pvar = variance_image[c*channel_offset + y*stride + x];
- float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+
+ float diff = 0.0f;
+ float scale_fac = 1.0f;
+ if(scale_image) {
+ scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+ }
+
+ for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+ float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
+ float pvar = variance_image[idx_p];
+ float qvar = sqr(scale_fac)*variance_image[idx_q];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
@@ -133,7 +141,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
const ccl_global float *ccl_restrict image,
ccl_global float *out_image,
ccl_global float *accum_image,
- int4 rect, int stride, int f)
+ int4 rect, int channel_offset,
+ int stride, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
@@ -142,12 +151,21 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
sum += difference_image[y*stride + x1];
}
sum *= 1.0f/(high-low);
+
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
if(out_image) {
- atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
- atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
+ atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+ float val = image[idx_q];
+ if(channel_offset) {
+ val += image[idx_q + channel_offset];
+ val += image[idx_q + 2*channel_offset];
+ val *= 1.0f/3.0f;
+ }
+ atomic_add_and_fetch_float(out_image + idx_p, sum*val);
}
else {
- accum_image[y*stride + x] = sum;
+ accum_image[idx_p] = sum;
}
}
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 3507f80df46..41be4dbea49 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample,
int x, int y,
ccl_global float *mean,
ccl_global float *variance,
+ float scale,
int4 rect, int buffer_pass_stride,
int buffer_denoising_offset)
{
@@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample,
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
- mean[idx] = center_buffer[m_offset] / sample;
- if(sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
- }
- else {
- /* Can't compute variance with single sample, just set it very high. */
- variance[idx] = 1e10f;
+ float val = scale * center_buffer[m_offset];
+ mean[idx] = val;
+
+ if(v_offset >= 0) {
+ if(sample > 1) {
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1)));
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
}
}
+ccl_device void kernel_filter_write_feature(int sample,
+ int x, int y,
+ int4 buffer_params,
+ ccl_global float *from,
+ ccl_global float *buffer,
+ int out_offset,
+ int4 rect)
+{
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ combined_buffer[out_offset] = from[idx];
+}
+
ccl_device void kernel_filter_detect_outliers(int x, int y,
ccl_global float *image,
ccl_global float *variance,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 58740d5b06a..e5d3b0da835 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -108,11 +108,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
- final_color *= sample;
- if(buffer_params.w) {
- final_color.x += combined_buffer[buffer_params.w+0];
- final_color.y += combined_buffer[buffer_params.w+1];
- final_color.z += combined_buffer[buffer_params.w+2];
+ if(buffer_params.w >= 0) {
+ final_color *= sample;
+ if(buffer_params.w > 0) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
}
combined_buffer[0] = final_color.x;
combined_buffer[1] = final_color.y;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 864aa7c470a..caa0057d997 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets {
DENOISING_PASS_COLOR_VAR = 23,
DENOISING_PASS_CLEAN = 26,
+ DENOISING_PASS_PREFILTERED_DEPTH = 0,
+ DENOISING_PASS_PREFILTERED_NORMAL = 1,
+ DENOISING_PASS_PREFILTERED_SHADOWING = 4,
+ DENOISING_PASS_PREFILTERED_ALBEDO = 5,
+ DENOISING_PASS_PREFILTERED_COLOR = 8,
+ DENOISING_PASS_PREFILTERED_VARIANCE = 11,
+ DENOISING_PASS_PREFILTERED_INTENSITY = 14,
+
DENOISING_PASS_SIZE_BASE = 26,
DENOISING_PASS_SIZE_CLEAN = 3,
+ DENOISING_PASS_SIZE_PREFILTERED = 15,
} DenoisingPassOffsets;
typedef enum eBakePassFilter {
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index e036b53b810..08333c7a455 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
int y,
float *mean,
float *variance,
+ float scale,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset);
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+ int x,
+ int y,
+ int *buffer_params,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int* prefilter_rect);
+
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
@@ -71,7 +81,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weight_image,
- float *variance,
+ float *variance_image,
+ float *scale_image,
float *difference_image,
int* rect,
int stride,
@@ -99,6 +110,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
float *out_image,
float *accum_image,
int* rect,
+ int channel_offset,
int stride,
int f);
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 4c758711481..b792367e3ab 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
int x,
int y,
float *mean, float *variance,
+ float scale,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset);
#endif
}
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+ int x,
+ int y,
+ int *buffer_params,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int* prefilter_rect)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
+#else
+ kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
+#endif
+}
+
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
@@ -130,8 +148,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
#else
- rank += storage_ofs;
- transform += storage_ofs*TRANSFORM_SIZE;
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
x, y,
load_int4(prefilter_rect),
@@ -146,7 +164,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weight_image,
- float *variance,
+ float *variance_image,
+ float *scale_image,
float *difference_image,
int *rect,
int stride,
@@ -157,7 +176,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
#else
- kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
+ kernel_filter_nlm_calc_difference(dx, dy,
+ weight_image,
+ variance_image,
+ scale_image,
+ difference_image,
+ load_int4(rect),
+ stride,
+ channel_offset,
+ a, k_2);
#endif
}
@@ -195,13 +222,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
float *out_image,
float *accum_image,
int *rect,
+ int channel_offset,
int stride,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
- kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
+ kernel_filter_nlm_update_output(dx, dy,
+ difference_image,
+ image,
+ temp_image,
+ out_image,
+ accum_image,
+ load_int4(rect),
+ channel_offset,
+ stride, f);
#endif
}
@@ -222,7 +258,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
- kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
+ kernel_filter_nlm_construct_gramian(dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ load_int4(rect),
+ load_int4(filter_window),
+ stride, f,
+ pass_stride);
#endif
}
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index b856cbde45c..3b51bb41aed 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -64,6 +64,7 @@ kernel_cuda_filter_get_feature(int sample,
int v_offset,
float *mean,
float *variance,
+ float scale,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -76,6 +77,7 @@ kernel_cuda_filter_get_feature(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset);
@@ -84,6 +86,30 @@ kernel_cuda_filter_get_feature(int sample,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_write_feature(int sample,
+ int4 buffer_params,
+ int4 filter_area,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int4 prefilter_rect)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ kernel_filter_write_feature(sample,
+ x + filter_area.x,
+ y + filter_area.y,
+ buffer_params,
+ from,
+ buffer,
+ out_offset,
+ prefilter_rect);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image,
float *variance,
float *depth,
@@ -136,6 +162,7 @@ extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
+ const float *ccl_restrict scale_image,
float *difference_image,
int w,
int h,
@@ -152,9 +179,11 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image,
variance_image,
+ scale_image,
difference_image + ofs,
rect, stride,
- channel_offset, a, k_2);
+ channel_offset,
+ a, k_2);
}
}
@@ -210,6 +239,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
int h,
int stride,
int pass_stride,
+ int channel_offset,
int r,
int f)
{
@@ -221,7 +251,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
image,
out_image,
accum_image,
- rect, stride, f);
+ rect,
+ channel_offset,
+ stride, f);
}
}
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index a550f97f4eb..8a821ee281d 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
int v_offset,
ccl_global float *mean,
ccl_global float *variance,
+ float scale,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset);
}
}
+__kernel void kernel_ocl_filter_write_feature(int sample,
+ int4 buffer_params,
+ int4 filter_area,
+ ccl_global float *from,
+ ccl_global float *buffer,
+ int out_offset,
+ int4 prefilter_rect)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ kernel_filter_write_feature(sample,
+ x + filter_area.x,
+ y + filter_area.y,
+ buffer_params,
+ from,
+ buffer,
+ out_offset,
+ prefilter_rect);
+ }
+}
+
__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
@@ -128,6 +152,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
ccl_global float *difference_image,
int w,
int h,
@@ -144,9 +169,11 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image,
variance_image,
+ scale_image,
difference_image + ofs,
rect, stride,
- channel_offset, a, k_2);
+ channel_offset,
+ a, k_2);
}
}
@@ -196,6 +223,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
int h,
int stride,
int pass_stride,
+ int channel_offset,
int r,
int f)
{
@@ -207,7 +235,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
image,
out_image,
accum_image,
- rect, stride, f);
+ rect,
+ channel_offset,
+ stride, f);
}
}
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f901885e679..66b8ef73acc 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -42,6 +42,7 @@ BufferParams::BufferParams()
denoising_data_pass = false;
denoising_clean_pass = false;
+ denoising_prefiltered_pass = false;
Pass::add(PASS_COMBINED, passes);
}
@@ -73,6 +74,7 @@ int BufferParams::get_passes_size()
if(denoising_data_pass) {
size += DENOISING_PASS_SIZE_BASE;
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+ if(denoising_prefiltered_pass) size += DENOISING_PASS_SIZE_PREFILTERED;
}
return align_up(size, 4);
@@ -88,6 +90,20 @@ int BufferParams::get_denoising_offset()
return offset;
}
+int BufferParams::get_denoising_prefiltered_offset()
+{
+ assert(denoising_prefiltered_pass);
+
+ int offset = get_denoising_offset();
+
+ offset += DENOISING_PASS_SIZE_BASE;
+ if(denoising_clean_pass) {
+ offset += DENOISING_PASS_SIZE_CLEAN;
+ }
+
+ return offset;
+}
+
/* Render Buffer Task */
RenderTile::RenderTile()
@@ -153,81 +169,62 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample
return false;
}
- float invsample = 1.0f/sample;
- float scale = invsample;
- bool variance = (type == DENOISING_PASS_NORMAL_VAR) ||
- (type == DENOISING_PASS_ALBEDO_VAR) ||
- (type == DENOISING_PASS_DEPTH_VAR) ||
- (type == DENOISING_PASS_COLOR_VAR);
+ float scale = 1.0f;
+ float alpha_scale = 1.0f/sample;
+ if(type == DENOISING_PASS_PREFILTERED_COLOR ||
+ type == DENOISING_PASS_CLEAN ||
+ type == DENOISING_PASS_PREFILTERED_INTENSITY) {
+ scale *= exposure;
+ }
+ else if(type == DENOISING_PASS_PREFILTERED_VARIANCE) {
+ scale *= exposure*exposure * (sample - 1);
+ }
- float scale_exposure = scale;
- if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) {
- scale_exposure *= exposure;
+ int offset;
+ if(type == DENOISING_PASS_CLEAN) {
+ /* The clean pass isn't changed by prefiltering, so we use the original one there. */
+ offset = type + params.get_denoising_offset();
}
- else if(type == DENOISING_PASS_COLOR_VAR) {
- scale_exposure *= exposure*exposure;
+ else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
+ /* If we're not saving the prefiltering result, return the original noisy pass. */
+ offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
+ scale /= sample;
+ }
+ else {
+ offset = type + params.get_denoising_prefiltered_offset();
}
- int offset = type + params.get_denoising_offset();
int pass_stride = params.get_passes_size();
int size = params.width*params.height;
- if(variance) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- int mean_offset = offset - components;
- float *mean = buffer.data() + mean_offset;
- float *var = buffer.data() + offset;
- assert(mean_offset >= 0);
-
- if(components == 1) {
- for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) {
- pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
- }
+ float *in = buffer.data() + offset;
+
+ if(components == 1) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ pixels[0] = in[0]*scale;
}
- else if(components == 3) {
- for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) {
- pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
- pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure;
- pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure;
- }
+ }
+ else if(components == 3) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
}
- else {
- return false;
+ }
+ else if(components == 4) {
+ /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
+ assert(params.passes[0].type == PASS_COMBINED);
+ float *in_combined = buffer.data();
+
+ for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
+ pixels[3] = saturate(in_combined[3]*alpha_scale);
}
}
else {
- float *in = buffer.data() + offset;
-
- if(components == 1) {
- for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
- pixels[0] = in[0]*scale_exposure;
- }
- }
- else if(components == 3) {
- for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- pixels[0] = in[0]*scale_exposure;
- pixels[1] = in[1]*scale_exposure;
- pixels[2] = in[2]*scale_exposure;
- }
- }
- else if(components == 4) {
- assert(type == DENOISING_PASS_COLOR);
-
- /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
- assert(params.passes[0].type == PASS_COMBINED);
- float *in_combined = buffer.data();
-
- for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- pixels[0] = in[0]*scale_exposure;
- pixels[1] = in[1]*scale_exposure;
- pixels[2] = in[2]*scale_exposure;
- pixels[3] = saturate(in_combined[3]*scale);
- }
- }
- else {
- return false;
- }
+ return false;
}
return true;
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 46c3b89bd84..0a010718d6d 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -54,6 +54,10 @@ public:
bool denoising_data_pass;
/* If only some light path types should be denoised, an additional pass is needed. */
bool denoising_clean_pass;
+ /* When we're prefiltering the passes during rendering, we need to keep both the
+ * original and the prefiltered data around because neighboring tiles might still
+ * need the original data. */
+ bool denoising_prefiltered_pass;
/* functions */
BufferParams();
@@ -63,6 +67,7 @@ public:
void add_pass(PassType type);
int get_passes_size();
int get_denoising_offset();
+ int get_denoising_prefiltered_offset();
};
/* Render Buffers */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index d0f15496e50..b305fa59123 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -286,6 +286,7 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+ SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
return type;
@@ -469,6 +470,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
kfilm->use_light_pass = 1;
}
+ if(denoising_prefiltered_pass) {
+ kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
+ }
}
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index c597db4e4c5..8330a4cf413 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -60,6 +60,7 @@ public:
vector<Pass> passes;
bool denoising_data_pass;
bool denoising_clean_pass;
+ bool denoising_prefiltered_pass;
int denoising_flags;
float pass_alpha_threshold;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c818f2b496c..3cee3b8bece 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -689,7 +689,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
BakeManager *bake_manager = scene->bake_manager;
requested_features.use_baking = bake_manager->get_baking();
requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
- if(params.denoising_passes) {
+ if(params.run_denoising) {
requested_features.use_denoising = true;
requested_features.use_shadow_tricks = true;
}
@@ -927,7 +927,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
*/
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
}
- if(params.use_denoising) {
+ if(params.run_denoising) {
substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
}
}
@@ -975,7 +975,7 @@ void Session::render()
task.requested_tile_size = params.tile_size;
task.passes_size = tile_manager.params.get_passes_size();
- if(params.use_denoising) {
+ if(params.run_denoising) {
task.denoising_radius = params.denoising_radius;
task.denoising_strength = params.denoising_strength;
task.denoising_feature_strength = params.denoising_feature_strength;
@@ -983,8 +983,13 @@ void Session::render()
assert(!scene->film->need_update);
task.pass_stride = scene->film->pass_stride;
+ task.target_pass_stride = task.pass_stride;
task.pass_denoising_data = scene->film->denoising_data_offset;
task.pass_denoising_clean = scene->film->denoising_clean_offset;
+
+ task.denoising_from_render = true;
+ task.denoising_do_filter = params.full_denoising;
+ task.denoising_write_passes = params.write_denoising_passes;
}
device->task_add(task);
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c7f590915e7..cb1d8fed68f 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -60,8 +60,9 @@ public:
bool display_buffer_linear;
- bool use_denoising;
- bool denoising_passes;
+ bool run_denoising;
+ bool write_denoising_passes;
+ bool full_denoising;
int denoising_radius;
float denoising_strength;
float denoising_feature_strength;
@@ -94,8 +95,9 @@ public:
use_profiling = false;
- use_denoising = false;
- denoising_passes = false;
+ run_denoising = false;
+ write_denoising_passes = false;
+ full_denoising = false;
denoising_radius = 8;
denoising_strength = 0.0f;
denoising_feature_strength = 0.0f;