Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lukas Stockner <lukas.stockner@freenet.de> 2019-02-06 16:19:20 +0300
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2019-02-06 17:18:42 +0300
commit: fccf506ed7fd96f8a8f5edda7b99f564a386321a (patch)
tree: 80a4d10012b13e1601011e5cf6d4771d0e382775 /intern/cycles/device
parent: c183ac73dcfd20d0acf5ca07a2b062deadc4d73a (diff)
Cycles: animation denoising support in the kernel.
This is the internal implementation, not available from the API or interface yet. The algorithm takes into account past and future frames, both to get more coherent animation and reduce noise. Ref D3889.
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- intern/cycles/device/device_cpu.cpp 23
-rw-r--r-- intern/cycles/device/device_cuda.cpp 21
-rw-r--r-- intern/cycles/device/device_denoising.cpp 59
-rw-r--r-- intern/cycles/device/device_denoising.h 7
-rw-r--r-- intern/cycles/device/device_task.h 2
-rw-r--r-- intern/cycles/device/opencl/opencl.h 1
-rw-r--r-- intern/cycles/device/opencl/opencl_base.cpp 31
7 files changed, 117 insertions, 27 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6668acc9cbe..93c63b92a55 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -186,15 +186,15 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+ KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -512,7 +512,7 @@ public:
difference,
local_rect,
w, channel_offset,
- a, k_2);
+ 0, a, k_2);
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
@@ -542,6 +542,7 @@ public:
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ task->tile_info,
x + task->filter_area.x,
y + task->filter_area.y,
y*task->filter_area.z + x,
@@ -549,6 +550,8 @@ public:
(int*) task->storage.rank.device_pointer,
&task->rect.x,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ task->buffer.use_time,
task->radius,
task->pca_threshold);
}
@@ -559,6 +562,7 @@ public:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
@@ -568,6 +572,7 @@ public:
float *blurDifference = temporary_mem + task->buffer.pass_stride;
int r = task->radius;
+ int frame_offset = frame * task->buffer.frame_stride;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
@@ -583,12 +588,14 @@ public:
local_rect,
task->buffer.stride,
task->buffer.pass_stride,
+ frame_offset,
1.0f,
task->nlm_k_2);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_construct_gramian_kernel()(dx, dy,
+ task->tile_info->frames[frame],
blurDifference,
(float*) task->buffer.mem.device_pointer,
(float*) task->storage.transform.device_pointer,
@@ -599,7 +606,9 @@ public:
&task->reconstruction_state.filter_window.x,
task->buffer.stride,
4,
- task->buffer.pass_stride);
+ task->buffer.pass_stride,
+ frame_offset,
+ task->buffer.use_time);
}
return true;
@@ -787,7 +796,7 @@ public:
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index cb7d8bbb224..e21d974ebbe 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1301,6 +1301,7 @@ public:
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
if(have_error())
return false;
@@ -1327,7 +1328,7 @@ public:
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
@@ -1367,13 +1368,16 @@ public:
task->storage.h);
void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->filter_area,
&task->rect,
&task->radius,
&task->pca_threshold,
- &task->buffer.pass_stride};
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
@@ -1383,6 +1387,7 @@ public:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
if(have_error())
@@ -1398,6 +1403,8 @@ public:
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
@@ -1430,10 +1437,12 @@ public:
&w, &h,
&stride, &pass_stride,
&r, &pass_stride,
+ &frame_offset,
&a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&blurDifference,
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
&task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
@@ -1442,7 +1451,9 @@ public:
&task->reconstruction_state.filter_window,
&w, &h, &stride,
&pass_stride, &r,
- &f};
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1635,7 +1646,7 @@ public:
void denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 724171c3acb..61e0ba47ab8 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
}
+ render_buffer.frame_stride = task.frame_stride;
render_buffer.pass_stride = task.pass_stride;
render_buffer.offset = task.pass_denoising_data;
@@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
tile_info->from_render = task.denoising_from_render? 1 : 0;
+ tile_info->frames[0] = 0;
+ tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+ for(int i = 1; i < tile_info->num_frames; i++) {
+ tile_info->frames[i] = task.denoising_frames[i-1];
+ }
+
write_passes = task.denoising_write_passes;
do_filter = task.denoising_do_filter;
}
@@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_expand(rect, radius);
rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.use_intensity = write_passes;
+ buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
buffer.passes = buffer.use_intensity? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+ buffer.frame_stride = buffer.pass_stride * buffer.passes;
/* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
+ int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
buffer.mem.alloc_to_device(mem_size, false);
+ buffer.use_time = (tile_info->num_frames > 1);
/* CPUs process shifts sequentially while GPUs process them in parallel. */
int num_layers;
@@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color()
}
}
+void DenoisingTask::load_buffer()
+{
+ device_ptr null_ptr = (device_ptr) 0;
+
+ int original_offset = render_buffer.offset;
+
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int i = 0; i < tile_info->num_frames; i++) {
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
+ bool is_variance = (pass >= 11) && (pass <= 13);
+ functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
+ }
+ render_buffer.offset += render_buffer.frame_stride;
+ }
+
+ render_buffer.offset = original_offset;
+}
+
void DenoisingTask::write_buffer()
{
reconstruction_state.buffer_params = make_int4(target_buffer.offset,
@@ -259,11 +287,17 @@ void DenoisingTask::reconstruct()
device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
-
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
- delete scale_sub_ptr;
+ for(int f = 0; f < tile_info->num_frames; f++) {
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+ scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ scale_ptr = **scale_sub_ptr;
+ }
+
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+ delete scale_sub_ptr;
+ }
functions.solve(target_buffer.ptr);
}
@@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
setup_denoising_buffer();
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
+ if(tile_info->from_render) {
+ prefilter_shadowing();
+ prefilter_features();
+ prefilter_color();
+ }
+ else {
+ load_buffer();
+ }
if(do_filter) {
construct_transform();
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index cddcd3bd0c9..5869aa05390 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -38,6 +38,7 @@ public:
struct RenderBuffers {
int offset;
int pass_stride;
+ int frame_stride;
int samples;
} render_buffer;
@@ -70,7 +71,8 @@ public:
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
- device_ptr scale_ptr
+ device_ptr scale_ptr,
+ int frame
)> accumulate;
function<bool(device_ptr output_ptr)> solve;
function<bool()> construct_transform;
@@ -156,8 +158,10 @@ public:
int stride;
int h;
int width;
+ int frame_stride;
device_only_memory<float> mem;
device_only_memory<float> temporary_mem;
+ bool use_time;
bool use_intensity;
bool gpu_temporary_mem;
@@ -179,6 +183,7 @@ protected:
void construct_transform();
void reconstruct();
+ void load_buffer();
void write_buffer();
};
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 97bcde99af6..2871bc5761a 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -73,11 +73,13 @@ public:
float denoising_feature_strength;
bool denoising_relative_pca;
bool denoising_from_render;
+ vector<int> denoising_frames;
bool denoising_do_filter;
bool denoising_write_passes;
int pass_stride;
+ int frame_stride;
int target_pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4d42ddc0c53..9b763167459 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -422,6 +422,7 @@ protected:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task);
bool denoising_solve(device_ptr output_ptr,
DenoisingTask *task);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a0a1cf68c32..4417065bb7f 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ char use_time = task->buffer.use_time? 1 : 0;
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
- kernel_set_args(ckFilterConstructTransform, 0,
- buffer_mem,
+ int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ tile_info_mem);
+ cl_mem buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
+ buffers[i]);
+ }
+ kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
transform_mem,
rank_mem,
task->filter_area,
task->rect,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ use_time,
task->radius,
task->pca_threshold);
@@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
cl_mem color_mem = CL_MEM_PTR(color_ptr);
@@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+ char use_time = task->buffer.use_time? 1 : 0;
int r = task->radius;
int pass_stride = task->buffer.pass_stride;
@@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
pass_stride,
r,
pass_stride,
+ frame_offset,
1.0f, task->nlm_k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
@@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
pass_stride,
r, 4);
kernel_set_args(ckNLMConstructGramian, 0,
+ t,
blurDifference_mem,
buffer_mem,
transform_mem,
@@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
task->reconstruction_state.filter_window,
w, h, stride,
pass_stride,
- r, 4);
+ r, 4,
+ frame_offset,
+ use_time);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
@@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);