Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--intern/cycles/blender/addon/engine.py9
-rw-r--r--intern/cycles/blender/blender_session.cpp17
-rw-r--r--intern/cycles/blender/blender_sync.cpp35
-rw-r--r--intern/cycles/device/device_cpu.cpp71
-rw-r--r--intern/cycles/device/device_cuda.cpp169
-rw-r--r--intern/cycles/device/device_denoising.cpp72
-rw-r--r--intern/cycles/device/device_denoising.h22
-rw-r--r--intern/cycles/device/device_task.h6
-rw-r--r--intern/cycles/device/opencl/opencl.h15
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp71
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h1
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h19
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h36
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h39
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h12
-rw-r--r--intern/cycles/kernel/kernel_types.h9
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h14
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h56
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu36
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl34
-rw-r--r--intern/cycles/render/buffers.cpp123
-rw-r--r--intern/cycles/render/buffers.h5
-rw-r--r--intern/cycles/render/film.cpp4
-rw-r--r--intern/cycles/render/film.h1
-rw-r--r--intern/cycles/render/session.cpp11
-rw-r--r--intern/cycles/render/session.h10
26 files changed, 644 insertions, 253 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 23239ee4352..83b9a8eee0c 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -269,14 +269,11 @@ def register_passes(engine, scene, srl):
engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR')
if crl.denoising_store_passes:
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
- engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Shadowing", 1, "X", 'VALUE')
+ engine.register_pass(scene, srl, "Denoising Variance", 3, "RGB", 'COLOR')
+ engine.register_pass(scene, srl, "Denoising Intensity", 1, "X", 'VALUE')
clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
"denoising_glossy_direct", "denoising_glossy_indirect",
"denoising_transmission_direct", "denoising_transmission_indirect",
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index dfa92dd1bc7..50ac35069a9 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -418,15 +418,19 @@ void BlenderSession::render()
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
- bool use_denoising = get_boolean(crl, "use_denoising");
- bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes");
+ bool full_denoising = get_boolean(crl, "use_denoising");
+ bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
- session->tile_manager.schedule_denoising = use_denoising;
- buffer_params.denoising_data_pass = denoising_passes;
+ bool run_denoising = full_denoising || write_denoising_passes;
+
+ session->tile_manager.schedule_denoising = run_denoising;
+ buffer_params.denoising_data_pass = run_denoising;
buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+ buffer_params.denoising_prefiltered_pass = write_denoising_passes;
- session->params.use_denoising = use_denoising;
- session->params.denoising_passes = denoising_passes;
+ session->params.run_denoising = run_denoising;
+ session->params.full_denoising = full_denoising;
+ session->params.write_denoising_passes = write_denoising_passes;
session->params.denoising_radius = get_int(crl, "denoising_radius");
session->params.denoising_strength = get_float(crl, "denoising_strength");
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
@@ -434,6 +438,7 @@ void BlenderSession::render()
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
+ scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass;
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 703fcc2078b..a6050b66040 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -531,7 +531,7 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
{
string name = b_pass.name();
- if(name == "Noisy Image") return DENOISING_PASS_COLOR;
+ if(name == "Noisy Image") return DENOISING_PASS_PREFILTERED_COLOR;
if(name.substr(0, 10) != "Denoising ") {
return -1;
@@ -539,15 +539,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
name = name.substr(10);
#define MAP_PASS(passname, offset) if(name == passname) return offset;
- MAP_PASS("Normal", DENOISING_PASS_NORMAL);
- MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
- MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
- MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
- MAP_PASS("Depth", DENOISING_PASS_DEPTH);
- MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
- MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
- MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
- MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+ MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
+ MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
+ MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
+ MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
+ MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
+ MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
MAP_PASS("Clean", DENOISING_PASS_CLEAN);
#undef MAP_PASS
@@ -579,10 +576,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
}
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
- bool use_denoising = get_boolean(crp, "use_denoising");
- bool store_denoising_passes = get_boolean(crp, "denoising_store_passes");
+ bool full_denoising = get_boolean(crp, "use_denoising");
+ bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
+
scene->film->denoising_flags = 0;
- if(use_denoising || store_denoising_passes) {
+ if(full_denoising || write_denoising_passes) {
#define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag;
MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
@@ -596,16 +594,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
b_engine.add_pass("Noisy Image", 4, "RGBA", b_srlay.name().c_str());
}
- if(store_denoising_passes) {
+ if(write_denoising_passes) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
- b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadowing", 1, "X", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Intensity", 1, "X", b_srlay.name().c_str());
if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str());
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 16908b0244a..6668acc9cbe 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -180,16 +180,17 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -218,6 +219,7 @@ public:
REGISTER_KERNEL(shader),
REGISTER_KERNEL(filter_divide_shadow),
REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_write_feature),
REGISTER_KERNEL(filter_detect_outliers),
REGISTER_KERNEL(filter_combine_halves),
REGISTER_KERNEL(filter_nlm_calc_difference),
@@ -487,6 +489,8 @@ public:
int w = align_up(rect.z-rect.x, 4);
int h = rect.w-rect.y;
+ int stride = task->buffer.stride;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *blurDifference = temporary_mem;
@@ -504,9 +508,10 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) guide_ptr,
(float*) variance_ptr,
+ NULL,
difference,
local_rect,
- w, 0,
+ w, channel_offset,
a, k_2);
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
@@ -520,7 +525,8 @@ public:
(float*) out_ptr,
weightAccum,
local_rect,
- w, f);
+ channel_offset,
+ stride, f);
}
int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
@@ -550,16 +556,13 @@ public:
return true;
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *difference = temporary_mem;
float *blurDifference = temporary_mem + task->buffer.pass_stride;
@@ -575,6 +578,7 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) color_ptr,
(float*) color_variance_ptr,
+ (float*) scale_ptr,
difference,
local_rect,
task->buffer.stride,
@@ -597,6 +601,13 @@ public:
4,
task->buffer.pass_stride);
}
+
+ return true;
+ }
+
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_finalize_kernel()(x,
@@ -661,6 +672,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
@@ -674,6 +686,7 @@ public:
x, y,
(float*) mean_ptr,
(float*) variance_ptr,
+ scale,
&task->rect.x,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -682,6 +695,26 @@ public:
return true;
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_write_feature_kernel()(task->render_buffer.samples,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ &task->reconstruction_state.buffer_params.x,
+ (float*) from_ptr,
+ (float*) buffer_ptr,
+ out_offset,
+ &task->rect.x);
+ }
+ }
+ return true;
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -754,11 +787,13 @@ public:
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 7b3c25a86d5..cb7d8bbb224 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1300,7 +1300,7 @@ public:
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
- int channel_offset = 0;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
if(have_error())
return false;
@@ -1308,6 +1308,7 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
+ CUdeviceptr scale_ptr = 0;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
@@ -1326,10 +1327,10 @@ public:
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1379,19 +1380,16 @@ public:
return !have_error();
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
if(have_error())
return false;
CUDAContextScope scope(this);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
int r = task->radius;
int f = 4;
float a = 1.0f;
@@ -1410,60 +1408,69 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w, &h, &stride,
- &pass_stride, &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- }
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w, &h,
+ &stride, &pass_stride,
+ &r, &pass_stride,
+ &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w, &h, &stride,
+ &pass_stride, &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(cuFinalize,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- }
+ return !have_error();
+ }
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(cuFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
return !have_error();
@@ -1533,6 +1540,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
if(have_error())
@@ -1553,6 +1561,7 @@ public:
&variance_offset,
&mean_ptr,
&variance_ptr,
+ &scale,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.offset};
@@ -1562,6 +1571,36 @@ public:
return !have_error();
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1596,11 +1635,13 @@ public:
void denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 433cbd3c265..724171c3acb 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -39,11 +39,18 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
render_buffer.pass_stride = task.pass_stride;
render_buffer.offset = task.pass_denoising_data;
- target_buffer.pass_stride = task.pass_stride;
+ target_buffer.pass_stride = task.target_pass_stride;
target_buffer.denoising_clean_offset = task.pass_denoising_clean;
+ target_buffer.offset = 0;
functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
+
+ tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
+ tile_info->from_render = task.denoising_from_render? 1 : 0;
+
+ write_passes = task.denoising_write_passes;
+ do_filter = task.denoising_do_filter;
}
DenoisingTask::~DenoisingTask()
@@ -59,8 +66,6 @@ DenoisingTask::~DenoisingTask()
void DenoisingTask::set_render_buffer(RenderTile *rtiles)
{
- tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
-
for(int i = 0; i < 9; i++) {
tile_info->offsets[i] = rtiles[i].offset;
tile_info->strides[i] = rtiles[i].stride;
@@ -79,6 +84,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
target_buffer.stride = rtiles[9].stride;
target_buffer.ptr = rtiles[9].buffer;
+ if(write_passes && rtiles[9].buffers) {
+ target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+ }
+ else {
+ target_buffer.denoising_output_offset = 0;
+ }
+
tile_info_mem.copy_to_device();
}
@@ -89,7 +101,8 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_expand(rect, radius);
rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.passes = 14;
+ buffer.use_intensity = write_passes;
+ buffer.passes = buffer.use_intensity? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
@@ -129,14 +142,14 @@ void DenoisingTask::prefilter_shadowing()
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var,
filtered_b = *sample_var;
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
@@ -147,7 +160,7 @@ void DenoisingTask::prefilter_shadowing()
device_ptr final_a = *unfiltered_a,
final_b = *unfiltered_b;
/* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
@@ -167,9 +180,9 @@ void DenoisingTask::prefilter_features()
for(int pass = 0; pass < 7; pass++) {
device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
/* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples);
/* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
}
}
@@ -188,13 +201,33 @@ void DenoisingTask::prefilter_color()
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples);
}
device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+
+ if(buffer.use_intensity) {
+ device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true);
+ functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
+ }
+}
+
+void DenoisingTask::write_buffer()
+{
+ reconstruction_state.buffer_params = make_int4(target_buffer.offset,
+ target_buffer.stride,
+ target_buffer.pass_stride,
+ target_buffer.denoising_clean_offset);
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride);
+ int out_offset = pass + target_buffer.denoising_output_offset;
+ functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
+ }
}
void DenoisingTask::construct_transform()
@@ -212,6 +245,8 @@ void DenoisingTask::reconstruct()
{
storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
+ storage.XtWX.zero_to_device();
+ storage.XtWY.zero_to_device();
reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x;
@@ -224,7 +259,12 @@ void DenoisingTask::reconstruct()
device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
- functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr);
+
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
+ delete scale_sub_ptr;
+ functions.solve(target_buffer.ptr);
}
void DenoisingTask::run_denoising(RenderTile *tile)
@@ -240,8 +280,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
prefilter_features();
prefilter_color();
- construct_transform();
- reconstruct();
+ if(do_filter) {
+ construct_transform();
+ reconstruct();
+ }
+
+ if(write_passes) {
+ write_buffer();
+ }
functions.unmap_neighbor_tiles(rtiles);
}
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index beae60c220f..cddcd3bd0c9 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -47,6 +47,7 @@ public:
int stride;
int pass_stride;
int denoising_clean_offset;
+ int denoising_output_offset;
device_ptr ptr;
} target_buffer;
@@ -58,6 +59,9 @@ public:
int4 rect;
int4 filter_area;
+ bool write_passes;
+ bool do_filter;
+
struct DeviceFunctions {
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
@@ -66,8 +70,9 @@ public:
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
- device_ptr output_ptr
- )> reconstruct;
+ device_ptr scale_ptr
+ )> accumulate;
+ function<bool(device_ptr output_ptr)> solve;
function<bool()> construct_transform;
function<bool(device_ptr a_ptr,
@@ -86,13 +91,18 @@ public:
function<bool(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
- device_ptr variance_ptr
+ device_ptr variance_ptr,
+ float scale
)> get_feature;
function<bool(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr
)> detect_outliers;
+ function<bool(int out_offset,
+ device_ptr frop_ptr,
+ device_ptr buffer_ptr
+ )> write_feature;
function<void(RenderTile *rtiles)> map_neighbor_tiles;
function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
} functions;
@@ -114,8 +124,9 @@ public:
int f; /* Patch size of the filter. */
float a; /* Variance compensation factor in the MSE estimation. */
float k_2; /* Squared value of the k parameter of the filter. */
+ bool is_color;
- void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+ void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; }
} nlm_state;
struct Storage {
@@ -147,6 +158,7 @@ public:
int width;
device_only_memory<float> mem;
device_only_memory<float> temporary_mem;
+ bool use_intensity;
bool gpu_temporary_mem;
@@ -166,6 +178,8 @@ protected:
void prefilter_color();
void construct_transform();
void reconstruct();
+
+ void write_buffer();
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 861014373b3..97bcde99af6 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -72,7 +72,13 @@ public:
float denoising_strength;
float denoising_feature_strength;
bool denoising_relative_pca;
+ bool denoising_from_render;
+
+ bool denoising_do_filter;
+ bool denoising_write_passes;
+
int pass_stride;
+ int target_pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index ea7ed4f1909..4d42ddc0c53 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -419,10 +419,12 @@ protected:
device_ptr out_ptr,
DenoisingTask *task);
bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task);
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task);
bool denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
@@ -439,7 +441,12 @@ protected:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task);
+ bool denoising_write_feature(int to_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index d4d7c0f74bc..a0a1cf68c32 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
@@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem out_mem = CL_MEM_PTR(out_ptr);
+ cl_mem scale_mem = NULL;
mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
@@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
guide_mem,
variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
- r, 0, a, k_2);
+ r, channel_offset,
+ 0, a, k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
blurDifference_mem,
@@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
weightAccum_mem,
w, h, stride,
pass_stride,
+ channel_offset,
r, f);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
@@ -837,17 +842,14 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
return true;
}
-bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ DenoisingTask *task)
{
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
cl_mem color_mem = CL_MEM_PTR(color_ptr);
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
@@ -859,7 +861,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
@@ -877,6 +878,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
color_mem,
color_variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
@@ -913,6 +915,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+
kernel_set_args(ckFinalize, 0,
output_mem,
rank_mem,
@@ -1000,6 +1018,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
@@ -1023,6 +1042,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
variance_offset,
mean_mem,
variance_mem,
+ scale,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -1033,6 +1053,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
return true;
}
+bool OpenCLDeviceBase::denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+{
+ cl_mem from_mem = CL_MEM_PTR(from_ptr);
+ cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
+
+ cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
+
+ kernel_set_args(ckFilterWriteFeature, 0,
+ task->render_buffer.samples,
+ task->reconstruction_state.buffer_params,
+ task->filter_area,
+ from_mem,
+ buffer_mem,
+ out_offset,
+ task->rect);
+ enqueue_kernel(ckFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ return true;
+}
+
bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1063,11 +1108,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 67f4e62ac0f..9ac7c3db23d 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -27,6 +27,7 @@ typedef struct TileInfo {
int strides[9];
int x[4];
int y[4];
+ int from_render;
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index af73c0dadf2..0c4387af540 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
+ const float *ccl_restrict scale_image,
float *difference_image,
int4 rect,
int stride,
@@ -41,13 +42,21 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
int idx_q = (y+dy)*stride + aligned_lowx + dx;
for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
float4 diff = make_float4(0.0f);
+ float4 scale_fac;
+ if(scale_image) {
+ scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+ make_float4(0.25f), make_float4(4.0f));
+ }
+ else {
+ scale_fac = make_float4(1.0f);
+ }
for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
/* idx_p is guaranteed to be aligned, but idx_q isn't. */
float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+ float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
float4 cdiff = color_p - color_q;
float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+ float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
}
load4_a(difference_image, idx_p) = diff*channel_fac;
@@ -143,6 +152,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
float *out_image,
float *accum_image,
int4 rect,
+ int channel_offset,
int stride,
int f)
{
@@ -160,6 +170,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
load4_a(accum_image, idx_p) += mask(active, weight);
float4 val = load4_u(image, idx_q);
+ if(channel_offset) {
+ val += load4_u(image, idx_q + channel_offset);
+ val += load4_u(image, idx_q + 2*channel_offset);
+ val *= 1.0f/3.0f;
+ }
load4_a(out_image, idx_p) += mask(active, weight*val);
}
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 058afb34a92..d8e2e4d08aa 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -78,17 +78,25 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
int dx, int dy,
const ccl_global float *ccl_restrict weight_image,
const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
ccl_global float *difference_image,
int4 rect, int stride,
int channel_offset,
float a, float k_2)
{
- float diff = 0.0f;
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
int numChannels = channel_offset? 3 : 1;
- for(int c = 0; c < numChannels; c++) {
- float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
- float pvar = variance_image[c*channel_offset + y*stride + x];
- float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+
+ float diff = 0.0f;
+ float scale_fac = 1.0f;
+ if(scale_image) {
+ scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+ }
+
+ for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+ float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
+ float pvar = variance_image[idx_p];
+ float qvar = sqr(scale_fac)*variance_image[idx_q];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
@@ -133,7 +141,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
const ccl_global float *ccl_restrict image,
ccl_global float *out_image,
ccl_global float *accum_image,
- int4 rect, int stride, int f)
+ int4 rect, int channel_offset,
+ int stride, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
@@ -142,12 +151,21 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
sum += difference_image[y*stride + x1];
}
sum *= 1.0f/(high-low);
+
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
if(out_image) {
- atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
- atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
+ atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+ float val = image[idx_q];
+ if(channel_offset) {
+ val += image[idx_q + channel_offset];
+ val += image[idx_q + 2*channel_offset];
+ val *= 1.0f/3.0f;
+ }
+ atomic_add_and_fetch_float(out_image + idx_p, sum*val);
}
else {
- accum_image[y*stride + x] = sum;
+ accum_image[idx_p] = sum;
}
}
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 3507f80df46..41be4dbea49 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample,
int x, int y,
ccl_global float *mean,
ccl_global float *variance,
+ float scale,
int4 rect, int buffer_pass_stride,
int buffer_denoising_offset)
{
@@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample,
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
- mean[idx] = center_buffer[m_offset] / sample;
- if(sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
- }
- else {
- /* Can't compute variance with single sample, just set it very high. */
- variance[idx] = 1e10f;
+ float val = scale * center_buffer[m_offset];
+ mean[idx] = val;
+
+ if(v_offset >= 0) {
+ if(sample > 1) {
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1)));
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
}
}
+ccl_device void kernel_filter_write_feature(int sample,
+ int x, int y,
+ int4 buffer_params,
+ ccl_global float *from,
+ ccl_global float *buffer,
+ int out_offset,
+ int4 rect)
+{
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ combined_buffer[out_offset] = from[idx];
+}
+
ccl_device void kernel_filter_detect_outliers(int x, int y,
ccl_global float *image,
ccl_global float *variance,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 58740d5b06a..e5d3b0da835 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -108,11 +108,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
- final_color *= sample;
- if(buffer_params.w) {
- final_color.x += combined_buffer[buffer_params.w+0];
- final_color.y += combined_buffer[buffer_params.w+1];
- final_color.z += combined_buffer[buffer_params.w+2];
+ if(buffer_params.w >= 0) {
+ final_color *= sample;
+ if(buffer_params.w > 0) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
}
combined_buffer[0] = final_color.x;
combined_buffer[1] = final_color.y;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 864aa7c470a..caa0057d997 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets {
DENOISING_PASS_COLOR_VAR = 23,
DENOISING_PASS_CLEAN = 26,
+ DENOISING_PASS_PREFILTERED_DEPTH = 0,
+ DENOISING_PASS_PREFILTERED_NORMAL = 1,
+ DENOISING_PASS_PREFILTERED_SHADOWING = 4,
+ DENOISING_PASS_PREFILTERED_ALBEDO = 5,
+ DENOISING_PASS_PREFILTERED_COLOR = 8,
+ DENOISING_PASS_PREFILTERED_VARIANCE = 11,
+ DENOISING_PASS_PREFILTERED_INTENSITY = 14,
+
DENOISING_PASS_SIZE_BASE = 26,
DENOISING_PASS_SIZE_CLEAN = 3,
+ DENOISING_PASS_SIZE_PREFILTERED = 15,
} DenoisingPassOffsets;
typedef enum eBakePassFilter {
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index e036b53b810..08333c7a455 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
int y,
float *mean,
float *variance,
+ float scale,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset);
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+ int x,
+ int y,
+ int *buffer_params,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int* prefilter_rect);
+
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
@@ -71,7 +81,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weight_image,
- float *variance,
+ float *variance_image,
+ float *scale_image,
float *difference_image,
int* rect,
int stride,
@@ -99,6 +110,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
float *out_image,
float *accum_image,
int* rect,
+ int channel_offset,
int stride,
int f);
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 4c758711481..b792367e3ab 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
int x,
int y,
float *mean, float *variance,
+ float scale,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset);
#endif
}
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+ int x,
+ int y,
+ int *buffer_params,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int* prefilter_rect)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
+#else
+ kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
+#endif
+}
+
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
@@ -130,8 +148,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
#else
- rank += storage_ofs;
- transform += storage_ofs*TRANSFORM_SIZE;
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
x, y,
load_int4(prefilter_rect),
@@ -146,7 +164,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weight_image,
- float *variance,
+ float *variance_image,
+ float *scale_image,
float *difference_image,
int *rect,
int stride,
@@ -157,7 +176,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
#else
- kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
+ kernel_filter_nlm_calc_difference(dx, dy,
+ weight_image,
+ variance_image,
+ scale_image,
+ difference_image,
+ load_int4(rect),
+ stride,
+ channel_offset,
+ a, k_2);
#endif
}
@@ -195,13 +222,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
float *out_image,
float *accum_image,
int *rect,
+ int channel_offset,
int stride,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
- kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
+ kernel_filter_nlm_update_output(dx, dy,
+ difference_image,
+ image,
+ temp_image,
+ out_image,
+ accum_image,
+ load_int4(rect),
+ channel_offset,
+ stride, f);
#endif
}
@@ -222,7 +258,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
- kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
+ kernel_filter_nlm_construct_gramian(dx, dy,
+ difference_image,
+ buffer,
+ transform, rank,
+ XtWX, XtWY,
+ load_int4(rect),
+ load_int4(filter_window),
+ stride, f,
+ pass_stride);
#endif
}
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index b856cbde45c..3b51bb41aed 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -64,6 +64,7 @@ kernel_cuda_filter_get_feature(int sample,
int v_offset,
float *mean,
float *variance,
+ float scale,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -76,6 +77,7 @@ kernel_cuda_filter_get_feature(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset);
@@ -84,6 +86,30 @@ kernel_cuda_filter_get_feature(int sample,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_write_feature(int sample,
+ int4 buffer_params,
+ int4 filter_area,
+ float *from,
+ float *buffer,
+ int out_offset,
+ int4 prefilter_rect)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ kernel_filter_write_feature(sample,
+ x + filter_area.x,
+ y + filter_area.y,
+ buffer_params,
+ from,
+ buffer,
+ out_offset,
+ prefilter_rect);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image,
float *variance,
float *depth,
@@ -136,6 +162,7 @@ extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
+ const float *ccl_restrict scale_image,
float *difference_image,
int w,
int h,
@@ -152,9 +179,11 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image,
variance_image,
+ scale_image,
difference_image + ofs,
rect, stride,
- channel_offset, a, k_2);
+ channel_offset,
+ a, k_2);
}
}
@@ -210,6 +239,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
int h,
int stride,
int pass_stride,
+ int channel_offset,
int r,
int f)
{
@@ -221,7 +251,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
image,
out_image,
accum_image,
- rect, stride, f);
+ rect,
+ channel_offset,
+ stride, f);
}
}
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index a550f97f4eb..8a821ee281d 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
int v_offset,
ccl_global float *mean,
ccl_global float *variance,
+ float scale,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset)
@@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
m_offset, v_offset,
x, y,
mean, variance,
+ scale,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset);
}
}
+__kernel void kernel_ocl_filter_write_feature(int sample,
+ int4 buffer_params,
+ int4 filter_area,
+ ccl_global float *from,
+ ccl_global float *buffer,
+ int out_offset,
+ int4 prefilter_rect)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ kernel_filter_write_feature(sample,
+ x + filter_area.x,
+ y + filter_area.y,
+ buffer_params,
+ from,
+ buffer,
+ out_offset,
+ prefilter_rect);
+ }
+}
+
__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
@@ -128,6 +152,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
ccl_global float *difference_image,
int w,
int h,
@@ -144,9 +169,11 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
weight_image,
variance_image,
+ scale_image,
difference_image + ofs,
rect, stride,
- channel_offset, a, k_2);
+ channel_offset,
+ a, k_2);
}
}
@@ -196,6 +223,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
int h,
int stride,
int pass_stride,
+ int channel_offset,
int r,
int f)
{
@@ -207,7 +235,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
image,
out_image,
accum_image,
- rect, stride, f);
+ rect,
+ channel_offset,
+ stride, f);
}
}
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f901885e679..66b8ef73acc 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -42,6 +42,7 @@ BufferParams::BufferParams()
denoising_data_pass = false;
denoising_clean_pass = false;
+ denoising_prefiltered_pass = false;
Pass::add(PASS_COMBINED, passes);
}
@@ -73,6 +74,7 @@ int BufferParams::get_passes_size()
if(denoising_data_pass) {
size += DENOISING_PASS_SIZE_BASE;
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+ if(denoising_prefiltered_pass) size += DENOISING_PASS_SIZE_PREFILTERED;
}
return align_up(size, 4);
@@ -88,6 +90,20 @@ int BufferParams::get_denoising_offset()
return offset;
}
+int BufferParams::get_denoising_prefiltered_offset()
+{
+ assert(denoising_prefiltered_pass);
+
+ int offset = get_denoising_offset();
+
+ offset += DENOISING_PASS_SIZE_BASE;
+ if(denoising_clean_pass) {
+ offset += DENOISING_PASS_SIZE_CLEAN;
+ }
+
+ return offset;
+}
+
/* Render Buffer Task */
RenderTile::RenderTile()
@@ -153,81 +169,62 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample
return false;
}
- float invsample = 1.0f/sample;
- float scale = invsample;
- bool variance = (type == DENOISING_PASS_NORMAL_VAR) ||
- (type == DENOISING_PASS_ALBEDO_VAR) ||
- (type == DENOISING_PASS_DEPTH_VAR) ||
- (type == DENOISING_PASS_COLOR_VAR);
+ float scale = 1.0f;
+ float alpha_scale = 1.0f/sample;
+ if(type == DENOISING_PASS_PREFILTERED_COLOR ||
+ type == DENOISING_PASS_CLEAN ||
+ type == DENOISING_PASS_PREFILTERED_INTENSITY) {
+ scale *= exposure;
+ }
+ else if(type == DENOISING_PASS_PREFILTERED_VARIANCE) {
+ scale *= exposure*exposure * (sample - 1);
+ }
- float scale_exposure = scale;
- if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) {
- scale_exposure *= exposure;
+ int offset;
+ if(type == DENOISING_PASS_CLEAN) {
+ /* The clean pass isn't changed by prefiltering, so we use the original one there. */
+ offset = type + params.get_denoising_offset();
}
- else if(type == DENOISING_PASS_COLOR_VAR) {
- scale_exposure *= exposure*exposure;
+ else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
+ /* If we're not saving the prefiltering result, return the original noisy pass. */
+ offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
+ scale /= sample;
+ }
+ else {
+ offset = type + params.get_denoising_prefiltered_offset();
}
- int offset = type + params.get_denoising_offset();
int pass_stride = params.get_passes_size();
int size = params.width*params.height;
- if(variance) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- int mean_offset = offset - components;
- float *mean = buffer.data() + mean_offset;
- float *var = buffer.data() + offset;
- assert(mean_offset >= 0);
-
- if(components == 1) {
- for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) {
- pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
- }
+ float *in = buffer.data() + offset;
+
+ if(components == 1) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ pixels[0] = in[0]*scale;
}
- else if(components == 3) {
- for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) {
- pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
- pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure;
- pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure;
- }
+ }
+ else if(components == 3) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
}
- else {
- return false;
+ }
+ else if(components == 4) {
+ /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
+ assert(params.passes[0].type == PASS_COMBINED);
+ float *in_combined = buffer.data();
+
+ for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
+ pixels[3] = saturate(in_combined[3]*alpha_scale);
}
}
else {
- float *in = buffer.data() + offset;
-
- if(components == 1) {
- for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
- pixels[0] = in[0]*scale_exposure;
- }
- }
- else if(components == 3) {
- for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- pixels[0] = in[0]*scale_exposure;
- pixels[1] = in[1]*scale_exposure;
- pixels[2] = in[2]*scale_exposure;
- }
- }
- else if(components == 4) {
- assert(type == DENOISING_PASS_COLOR);
-
- /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
- assert(params.passes[0].type == PASS_COMBINED);
- float *in_combined = buffer.data();
-
- for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- pixels[0] = in[0]*scale_exposure;
- pixels[1] = in[1]*scale_exposure;
- pixels[2] = in[2]*scale_exposure;
- pixels[3] = saturate(in_combined[3]*scale);
- }
- }
- else {
- return false;
- }
+ return false;
}
return true;
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 46c3b89bd84..0a010718d6d 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -54,6 +54,10 @@ public:
bool denoising_data_pass;
/* If only some light path types should be denoised, an additional pass is needed. */
bool denoising_clean_pass;
+ /* When we're prefiltering the passes during rendering, we need to keep both the
+ * original and the prefiltered data around because neighboring tiles might still
+ * need the original data. */
+ bool denoising_prefiltered_pass;
/* functions */
BufferParams();
@@ -63,6 +67,7 @@ public:
void add_pass(PassType type);
int get_passes_size();
int get_denoising_offset();
+ int get_denoising_prefiltered_offset();
};
/* Render Buffers */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index d0f15496e50..b305fa59123 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -286,6 +286,7 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+ SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
return type;
@@ -469,6 +470,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
kfilm->use_light_pass = 1;
}
+ if(denoising_prefiltered_pass) {
+ kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
+ }
}
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index c597db4e4c5..8330a4cf413 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -60,6 +60,7 @@ public:
vector<Pass> passes;
bool denoising_data_pass;
bool denoising_clean_pass;
+ bool denoising_prefiltered_pass;
int denoising_flags;
float pass_alpha_threshold;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c818f2b496c..3cee3b8bece 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -689,7 +689,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
BakeManager *bake_manager = scene->bake_manager;
requested_features.use_baking = bake_manager->get_baking();
requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
- if(params.denoising_passes) {
+ if(params.run_denoising) {
requested_features.use_denoising = true;
requested_features.use_shadow_tricks = true;
}
@@ -927,7 +927,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
*/
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
}
- if(params.use_denoising) {
+ if(params.run_denoising) {
substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
}
}
@@ -975,7 +975,7 @@ void Session::render()
task.requested_tile_size = params.tile_size;
task.passes_size = tile_manager.params.get_passes_size();
- if(params.use_denoising) {
+ if(params.run_denoising) {
task.denoising_radius = params.denoising_radius;
task.denoising_strength = params.denoising_strength;
task.denoising_feature_strength = params.denoising_feature_strength;
@@ -983,8 +983,13 @@ void Session::render()
assert(!scene->film->need_update);
task.pass_stride = scene->film->pass_stride;
+ task.target_pass_stride = task.pass_stride;
task.pass_denoising_data = scene->film->denoising_data_offset;
task.pass_denoising_clean = scene->film->denoising_clean_offset;
+
+ task.denoising_from_render = true;
+ task.denoising_do_filter = params.full_denoising;
+ task.denoising_write_passes = params.write_denoising_passes;
}
device->task_add(task);
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c7f590915e7..cb1d8fed68f 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -60,8 +60,9 @@ public:
bool display_buffer_linear;
- bool use_denoising;
- bool denoising_passes;
+ bool run_denoising;
+ bool write_denoising_passes;
+ bool full_denoising;
int denoising_radius;
float denoising_strength;
float denoising_feature_strength;
@@ -94,8 +95,9 @@ public:
use_profiling = false;
- use_denoising = false;
- denoising_passes = false;
+ run_denoising = false;
+ write_denoising_passes = false;
+ full_denoising = false;
denoising_radius = 8;
denoising_strength = 0.0f;
denoising_feature_strength = 0.0f;