Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@gmail.com>2019-02-06 17:22:53 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2019-02-06 17:22:53 +0300
commite21ae0bb267a54482108ddd4feed99c89241804b (patch)
tree5d5220c578c0e41533a2a4430018ced6ff13e08c /intern/cycles/device
parente8292466bcb69282798bba5dd701fff514cb0b78 (diff)
parentfccf506ed7fd96f8a8f5edda7b99f564a386321a (diff)
Merge branch 'blender2.7'
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--intern/cycles/device/device_cpu.cpp90
-rw-r--r--intern/cycles/device/device_cuda.cpp182
-rw-r--r--intern/cycles/device/device_denoising.cpp119
-rw-r--r--intern/cycles/device/device_denoising.h27
-rw-r--r--intern/cycles/device/device_task.h8
-rw-r--r--intern/cycles/device/opencl/opencl.h16
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp100
7 files changed, 414 insertions, 128 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a92c052a5df..1f39a412083 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -180,20 +180,21 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+ KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -218,6 +219,7 @@ public:
REGISTER_KERNEL(shader),
REGISTER_KERNEL(filter_divide_shadow),
REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_write_feature),
REGISTER_KERNEL(filter_detect_outliers),
REGISTER_KERNEL(filter_combine_halves),
REGISTER_KERNEL(filter_nlm_calc_difference),
@@ -487,6 +489,8 @@ public:
int w = align_up(rect.z-rect.x, 4);
int h = rect.w-rect.y;
+ int stride = task->buffer.stride;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *blurDifference = temporary_mem;
@@ -504,10 +508,11 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) guide_ptr,
(float*) variance_ptr,
+ NULL,
difference,
local_rect,
- w, 0,
- a, k_2);
+ w, channel_offset,
+ 0, a, k_2);
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
@@ -520,7 +525,8 @@ public:
(float*) out_ptr,
weightAccum,
local_rect,
- w, f);
+ channel_offset,
+ stride, f);
}
int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
@@ -536,6 +542,7 @@ public:
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ task->tile_info,
x + task->filter_area.x,
y + task->filter_area.y,
y*task->filter_area.z + x,
@@ -543,6 +550,8 @@ public:
(int*) task->storage.rank.device_pointer,
&task->rect.x,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ task->buffer.use_time,
task->radius,
task->pca_threshold);
}
@@ -550,21 +559,20 @@ public:
return true;
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
float *difference = temporary_mem;
float *blurDifference = temporary_mem + task->buffer.pass_stride;
int r = task->radius;
+ int frame_offset = frame * task->buffer.frame_stride;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
@@ -575,16 +583,19 @@ public:
filter_nlm_calc_difference_kernel()(dx, dy,
(float*) color_ptr,
(float*) color_variance_ptr,
+ (float*) scale_ptr,
difference,
local_rect,
task->buffer.stride,
task->buffer.pass_stride,
+ frame_offset,
1.0f,
task->nlm_k_2);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_construct_gramian_kernel()(dx, dy,
+ task->tile_info->frames[frame],
blurDifference,
(float*) task->buffer.mem.device_pointer,
(float*) task->storage.transform.device_pointer,
@@ -595,8 +606,17 @@ public:
&task->reconstruction_state.filter_window.x,
task->buffer.stride,
4,
- task->buffer.pass_stride);
+ task->buffer.pass_stride,
+ frame_offset,
+ task->buffer.use_time);
}
+
+ return true;
+ }
+
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_finalize_kernel()(x,
@@ -661,6 +681,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
@@ -674,6 +695,7 @@ public:
x, y,
(float*) mean_ptr,
(float*) variance_ptr,
+ scale,
&task->rect.x,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -682,6 +704,26 @@ public:
return true;
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_write_feature_kernel()(task->render_buffer.samples,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ &task->reconstruction_state.buffer_params.x,
+ (float*) from_ptr,
+ (float*) buffer_ptr,
+ out_offset,
+ &task->rect.x);
+ }
+ }
+ return true;
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -754,11 +796,13 @@ public:
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 67f5793e793..ada538adf32 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1300,7 +1300,8 @@ public:
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
- int channel_offset = 0;
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
if(have_error())
return false;
@@ -1308,6 +1309,7 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
+ CUdeviceptr scale_ptr = 0;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
@@ -1326,10 +1328,10 @@ public:
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1366,32 +1368,33 @@ public:
task->storage.h);
void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->filter_area,
&task->rect,
&task->radius,
&task->pca_threshold,
- &task->buffer.pass_stride};
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
return !have_error();
}
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
{
if(have_error())
return false;
CUDAContextScope scope(this);
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
int r = task->radius;
int f = 4;
float a = 1.0f;
@@ -1400,6 +1403,8 @@ public:
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
@@ -1410,60 +1415,73 @@ public:
CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w, &h, &stride,
- &pass_stride, &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- }
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w, &h,
+ &stride, &pass_stride,
+ &r, &pass_stride,
+ &frame_offset,
+ &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w, &h, &stride,
+ &pass_stride, &r,
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(cuFinalize,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- }
+ return !have_error();
+ }
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(cuFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
return !have_error();
@@ -1533,6 +1551,7 @@ public:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
if(have_error())
@@ -1553,6 +1572,7 @@ public:
&variance_offset,
&mean_ptr,
&variance_ptr,
+ &scale,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.offset};
@@ -1562,6 +1582,36 @@ public:
return !have_error();
}
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1596,11 +1646,13 @@ public:
void denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 433cbd3c265..61e0ba47ab8 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -36,14 +36,28 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
}
+ render_buffer.frame_stride = task.frame_stride;
render_buffer.pass_stride = task.pass_stride;
render_buffer.offset = task.pass_denoising_data;
- target_buffer.pass_stride = task.pass_stride;
+ target_buffer.pass_stride = task.target_pass_stride;
target_buffer.denoising_clean_offset = task.pass_denoising_clean;
+ target_buffer.offset = 0;
functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
+
+ tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
+ tile_info->from_render = task.denoising_from_render? 1 : 0;
+
+ tile_info->frames[0] = 0;
+ tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+ for(int i = 1; i < tile_info->num_frames; i++) {
+ tile_info->frames[i] = task.denoising_frames[i-1];
+ }
+
+ write_passes = task.denoising_write_passes;
+ do_filter = task.denoising_do_filter;
}
DenoisingTask::~DenoisingTask()
@@ -59,8 +73,6 @@ DenoisingTask::~DenoisingTask()
void DenoisingTask::set_render_buffer(RenderTile *rtiles)
{
- tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
-
for(int i = 0; i < 9; i++) {
tile_info->offsets[i] = rtiles[i].offset;
tile_info->strides[i] = rtiles[i].stride;
@@ -79,6 +91,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
target_buffer.stride = rtiles[9].stride;
target_buffer.ptr = rtiles[9].buffer;
+ if(write_passes && rtiles[9].buffers) {
+ target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+ }
+ else {
+ target_buffer.denoising_output_offset = 0;
+ }
+
tile_info_mem.copy_to_device();
}
@@ -89,15 +108,18 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_expand(rect, radius);
rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.passes = 14;
+ buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
+ buffer.passes = buffer.use_intensity? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+ buffer.frame_stride = buffer.pass_stride * buffer.passes;
/* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
+ int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
buffer.mem.alloc_to_device(mem_size, false);
+ buffer.use_time = (tile_info->num_frames > 1);
/* CPUs process shifts sequentially while GPUs process them in parallel. */
int num_layers;
@@ -129,14 +151,14 @@ void DenoisingTask::prefilter_shadowing()
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var,
filtered_b = *sample_var;
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
@@ -147,7 +169,7 @@ void DenoisingTask::prefilter_shadowing()
device_ptr final_a = *unfiltered_a,
final_b = *unfiltered_b;
/* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
@@ -167,9 +189,9 @@ void DenoisingTask::prefilter_features()
for(int pass = 0; pass < 7; pass++) {
device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
/* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples);
/* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
}
}
@@ -188,13 +210,52 @@ void DenoisingTask::prefilter_color()
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples);
}
device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride);
device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+
+ if(buffer.use_intensity) {
+ device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true);
+ functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
+ }
+}
+
+void DenoisingTask::load_buffer()
+{
+ device_ptr null_ptr = (device_ptr) 0;
+
+ int original_offset = render_buffer.offset;
+
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int i = 0; i < tile_info->num_frames; i++) {
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
+ bool is_variance = (pass >= 11) && (pass <= 13);
+ functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
+ }
+ render_buffer.offset += render_buffer.frame_stride;
+ }
+
+ render_buffer.offset = original_offset;
+}
+
+void DenoisingTask::write_buffer()
+{
+ reconstruction_state.buffer_params = make_int4(target_buffer.offset,
+ target_buffer.stride,
+ target_buffer.pass_stride,
+ target_buffer.denoising_clean_offset);
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride);
+ int out_offset = pass + target_buffer.denoising_output_offset;
+ functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
+ }
}
void DenoisingTask::construct_transform()
@@ -212,6 +273,8 @@ void DenoisingTask::reconstruct()
{
storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
+ storage.XtWX.zero_to_device();
+ storage.XtWY.zero_to_device();
reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x;
@@ -224,7 +287,18 @@ void DenoisingTask::reconstruct()
device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
- functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr);
+ for(int f = 0; f < tile_info->num_frames; f++) {
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+ scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ scale_ptr = **scale_sub_ptr;
+ }
+
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+ delete scale_sub_ptr;
+ }
+ functions.solve(target_buffer.ptr);
}
void DenoisingTask::run_denoising(RenderTile *tile)
@@ -236,12 +310,23 @@ void DenoisingTask::run_denoising(RenderTile *tile)
setup_denoising_buffer();
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
+ if(tile_info->from_render) {
+ prefilter_shadowing();
+ prefilter_features();
+ prefilter_color();
+ }
+ else {
+ load_buffer();
+ }
+
+ if(do_filter) {
+ construct_transform();
+ reconstruct();
+ }
- construct_transform();
- reconstruct();
+ if(write_passes) {
+ write_buffer();
+ }
functions.unmap_neighbor_tiles(rtiles);
}
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index beae60c220f..5869aa05390 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -38,6 +38,7 @@ public:
struct RenderBuffers {
int offset;
int pass_stride;
+ int frame_stride;
int samples;
} render_buffer;
@@ -47,6 +48,7 @@ public:
int stride;
int pass_stride;
int denoising_clean_offset;
+ int denoising_output_offset;
device_ptr ptr;
} target_buffer;
@@ -58,6 +60,9 @@ public:
int4 rect;
int4 filter_area;
+ bool write_passes;
+ bool do_filter;
+
struct DeviceFunctions {
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
@@ -66,8 +71,10 @@ public:
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
- device_ptr output_ptr
- )> reconstruct;
+ device_ptr scale_ptr,
+ int frame
+ )> accumulate;
+ function<bool(device_ptr output_ptr)> solve;
function<bool()> construct_transform;
function<bool(device_ptr a_ptr,
@@ -86,13 +93,18 @@ public:
function<bool(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
- device_ptr variance_ptr
+ device_ptr variance_ptr,
+ float scale
)> get_feature;
function<bool(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr
)> detect_outliers;
+ function<bool(int out_offset,
+ device_ptr frop_ptr,
+ device_ptr buffer_ptr
+ )> write_feature;
function<void(RenderTile *rtiles)> map_neighbor_tiles;
function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
} functions;
@@ -114,8 +126,9 @@ public:
int f; /* Patch size of the filter. */
float a; /* Variance compensation factor in the MSE estimation. */
float k_2; /* Squared value of the k parameter of the filter. */
+ bool is_color;
- void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+ void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; }
} nlm_state;
struct Storage {
@@ -145,8 +158,11 @@ public:
int stride;
int h;
int width;
+ int frame_stride;
device_only_memory<float> mem;
device_only_memory<float> temporary_mem;
+ bool use_time;
+ bool use_intensity;
bool gpu_temporary_mem;
@@ -166,6 +182,9 @@ protected:
void prefilter_color();
void construct_transform();
void reconstruct();
+
+ void load_buffer();
+ void write_buffer();
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 861014373b3..2871bc5761a 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -72,7 +72,15 @@ public:
float denoising_strength;
float denoising_feature_strength;
bool denoising_relative_pca;
+ bool denoising_from_render;
+ vector<int> denoising_frames;
+
+ bool denoising_do_filter;
+ bool denoising_write_passes;
+
int pass_stride;
+ int frame_stride;
+ int target_pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index ea7ed4f1909..9b763167459 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -419,10 +419,13 @@ protected:
device_ptr out_ptr,
DenoisingTask *task);
bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task);
+ bool denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task);
bool denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
@@ -439,7 +442,12 @@ protected:
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task);
+ bool denoising_write_feature(int to_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index d4d7c0f74bc..4417065bb7f 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
+ int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
@@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem out_mem = CL_MEM_PTR(out_ptr);
+ cl_mem scale_mem = NULL;
mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
@@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
guide_mem,
variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
- r, 0, a, k_2);
+ r, channel_offset,
+ 0, a, k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
blurDifference_mem,
@@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
weightAccum_mem,
w, h, stride,
pass_stride,
+ channel_offset,
r, f);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
@@ -816,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ char use_time = task->buffer.use_time? 1 : 0;
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
- kernel_set_args(ckFilterConstructTransform, 0,
- buffer_mem,
+ int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ tile_info_mem);
+ cl_mem buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
+ buffers[i]);
+ }
+ kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
transform_mem,
rank_mem,
task->filter_area,
task->rect,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ use_time,
task->radius,
task->pca_threshold);
@@ -837,17 +857,15 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
return true;
}
-bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
+bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
{
- mem_zero(task->storage.XtWX);
- mem_zero(task->storage.XtWY);
-
cl_mem color_mem = CL_MEM_PTR(color_ptr);
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
@@ -859,11 +877,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+ char use_time = task->buffer.use_time? 1 : 0;
int r = task->radius;
int pass_stride = task->buffer.pass_stride;
@@ -877,11 +897,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
kernel_set_args(ckNLMCalcDifference, 0,
color_mem,
color_variance_mem,
+ scale_mem,
difference_mem,
w, h, stride,
pass_stride,
r,
pass_stride,
+ frame_offset,
1.0f, task->nlm_k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
@@ -896,6 +918,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
pass_stride,
r, 4);
kernel_set_args(ckNLMConstructGramian, 0,
+ t,
blurDifference_mem,
buffer_mem,
transform_mem,
@@ -905,7 +928,9 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
task->reconstruction_state.filter_window,
w, h, stride,
pass_stride,
- r, 4);
+ r, 4,
+ frame_offset,
+ use_time);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
@@ -913,6 +938,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+
kernel_set_args(ckFinalize, 0,
output_mem,
rank_mem,
@@ -1000,6 +1041,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
+ float scale,
DenoisingTask *task)
{
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
@@ -1023,6 +1065,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
variance_offset,
mean_mem,
variance_mem,
+ scale,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.offset);
@@ -1033,6 +1076,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
return true;
}
+bool OpenCLDeviceBase::denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+{
+ cl_mem from_mem = CL_MEM_PTR(from_ptr);
+ cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
+
+ cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
+
+ kernel_set_args(ckFilterWriteFeature, 0,
+ task->render_buffer.samples,
+ task->reconstruction_state.buffer_params,
+ task->filter_area,
+ from_mem,
+ buffer_mem,
+ out_offset,
+ task->rect);
+ enqueue_kernel(ckFilterWriteFeature,
+ task->filter_area.z,
+ task->filter_area.w);
+
+ return true;
+}
+
bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
@@ -1063,11 +1131,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
- denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising);
denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);