Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--intern/cycles/device/device_cpu.cpp23
-rw-r--r--intern/cycles/device/device_cuda.cpp21
-rw-r--r--intern/cycles/device/device_denoising.cpp59
-rw-r--r--intern/cycles/device/device_denoising.h7
-rw-r--r--intern/cycles/device/device_task.h2
-rw-r--r--intern/cycles/device/opencl/opencl.h1
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp31
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h6
-rw-r--r--intern/cycles/kernel/filter/filter_features.h77
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h53
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h13
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h11
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h12
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h59
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h54
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h60
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h9
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h19
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu25
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl20
20 files changed, 388 insertions, 174 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6668acc9cbe..93c63b92a55 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -186,15 +186,15 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+ KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -512,7 +512,7 @@ public:
difference,
local_rect,
w, channel_offset,
- a, k_2);
+ 0, a, k_2);
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
@@ -542,6 +542,7 @@ public:
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ task->tile_info,
x + task->filter_area.x,
y + task->filter_area.y,
y*task->filter_area.z + x,
@@ -549,6 +550,8 @@ public:
(int*) task->storage.rank.device_pointer,
&task->rect.x,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ task->buffer.use_time,
task->radius,
task->pca_threshold);
}
@@ -559,6 +562,7 @@ public:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
@@ -568,6 +572,7 @@ public:
float *blurDifference = temporary_mem + task->buffer.pass_stride;
int r = task->radius;
+ int frame_offset = frame * task->buffer.frame_stride;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
@@ -583,12 +588,14 @@ public:
local_rect,
task->buffer.stride,
task->buffer.pass_stride,
+ frame_offset,
1.0f,
task->nlm_k_2);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_construct_gramian_kernel()(dx, dy,
+ task->tile_info->frames[frame],
blurDifference,
(float*) task->buffer.mem.device_pointer,
(float*) task->storage.transform.device_pointer,
@@ -599,7 +606,9 @@ public:
&task->reconstruction_state.filter_window.x,
task->buffer.stride,
4,
- task->buffer.pass_stride);
+ task->buffer.pass_stride,
+ frame_offset,
+ task->buffer.use_time);
}
return true;
@@ -787,7 +796,7 @@ public:
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index cb7d8bbb224..e21d974ebbe 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1301,6 +1301,7 @@ public:
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
if(have_error())
return false;
@@ -1327,7 +1328,7 @@ public:
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
@@ -1367,13 +1368,16 @@ public:
task->storage.h);
void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->filter_area,
&task->rect,
&task->radius,
&task->pca_threshold,
- &task->buffer.pass_stride};
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
@@ -1383,6 +1387,7 @@ public:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
if(have_error())
@@ -1398,6 +1403,8 @@ public:
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
int pass_stride = task->buffer.pass_stride;
int num_shifts = (2*r+1)*(2*r+1);
@@ -1430,10 +1437,12 @@ public:
&w, &h,
&stride, &pass_stride,
&r, &pass_stride,
+ &frame_offset,
&a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&blurDifference,
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
&task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
@@ -1442,7 +1451,9 @@ public:
&task->reconstruction_state.filter_window,
&w, &h, &stride,
&pass_stride, &r,
- &f};
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1635,7 +1646,7 @@ public:
void denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 724171c3acb..61e0ba47ab8 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
}
+ render_buffer.frame_stride = task.frame_stride;
render_buffer.pass_stride = task.pass_stride;
render_buffer.offset = task.pass_denoising_data;
@@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
tile_info->from_render = task.denoising_from_render? 1 : 0;
+ tile_info->frames[0] = 0;
+ tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+ for(int i = 1; i < tile_info->num_frames; i++) {
+ tile_info->frames[i] = task.denoising_frames[i-1];
+ }
+
write_passes = task.denoising_write_passes;
do_filter = task.denoising_do_filter;
}
@@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_expand(rect, radius);
rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.use_intensity = write_passes;
+ buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
buffer.passes = buffer.use_intensity? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+ buffer.frame_stride = buffer.pass_stride * buffer.passes;
/* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
+ int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
buffer.mem.alloc_to_device(mem_size, false);
+ buffer.use_time = (tile_info->num_frames > 1);
/* CPUs process shifts sequentially while GPUs process them in parallel. */
int num_layers;
@@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color()
}
}
+void DenoisingTask::load_buffer()
+{
+ device_ptr null_ptr = (device_ptr) 0;
+
+ int original_offset = render_buffer.offset;
+
+ int num_passes = buffer.use_intensity? 15 : 14;
+ for(int i = 0; i < tile_info->num_frames; i++) {
+ for(int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
+ bool is_variance = (pass >= 11) && (pass <= 13);
+ functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
+ }
+ render_buffer.offset += render_buffer.frame_stride;
+ }
+
+ render_buffer.offset = original_offset;
+}
+
void DenoisingTask::write_buffer()
{
reconstruction_state.buffer_params = make_int4(target_buffer.offset,
@@ -259,11 +287,17 @@ void DenoisingTask::reconstruct()
device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
-
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
- delete scale_sub_ptr;
+ for(int f = 0; f < tile_info->num_frames; f++) {
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+ scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+ scale_ptr = **scale_sub_ptr;
+ }
+
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+ delete scale_sub_ptr;
+ }
functions.solve(target_buffer.ptr);
}
@@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
setup_denoising_buffer();
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
+ if(tile_info->from_render) {
+ prefilter_shadowing();
+ prefilter_features();
+ prefilter_color();
+ }
+ else {
+ load_buffer();
+ }
if(do_filter) {
construct_transform();
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index cddcd3bd0c9..5869aa05390 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -38,6 +38,7 @@ public:
struct RenderBuffers {
int offset;
int pass_stride;
+ int frame_stride;
int samples;
} render_buffer;
@@ -70,7 +71,8 @@ public:
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
- device_ptr scale_ptr
+ device_ptr scale_ptr,
+ int frame
)> accumulate;
function<bool(device_ptr output_ptr)> solve;
function<bool()> construct_transform;
@@ -156,8 +158,10 @@ public:
int stride;
int h;
int width;
+ int frame_stride;
device_only_memory<float> mem;
device_only_memory<float> temporary_mem;
+ bool use_time;
bool use_intensity;
bool gpu_temporary_mem;
@@ -179,6 +183,7 @@ protected:
void construct_transform();
void reconstruct();
+ void load_buffer();
void write_buffer();
};
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 97bcde99af6..2871bc5761a 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -73,11 +73,13 @@ public:
float denoising_feature_strength;
bool denoising_relative_pca;
bool denoising_from_render;
+ vector<int> denoising_frames;
bool denoising_do_filter;
bool denoising_write_passes;
int pass_stride;
+ int frame_stride;
int target_pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4d42ddc0c53..9b763167459 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -422,6 +422,7 @@ protected:
bool denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task);
bool denoising_solve(device_ptr output_ptr,
DenoisingTask *task);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a0a1cf68c32..4417065bb7f 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ char use_time = task->buffer.use_time? 1 : 0;
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
- kernel_set_args(ckFilterConstructTransform, 0,
- buffer_mem,
+ int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ tile_info_mem);
+ cl_mem buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
+ buffers[i]);
+ }
+ kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
transform_mem,
rank_mem,
task->filter_area,
task->rect,
task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ use_time,
task->radius,
task->pca_threshold);
@@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr scale_ptr,
+ int frame,
DenoisingTask *task)
{
cl_mem color_mem = CL_MEM_PTR(color_ptr);
@@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
int w = task->reconstruction_state.source_w;
int h = task->reconstruction_state.source_h;
int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+ char use_time = task->buffer.use_time? 1 : 0;
int r = task->radius;
int pass_stride = task->buffer.pass_stride;
@@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
pass_stride,
r,
pass_stride,
+ frame_offset,
1.0f, task->nlm_k_2);
kernel_set_args(ckNLMBlur, 0,
difference_mem,
@@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
pass_stride,
r, 4);
kernel_set_args(ckNLMConstructGramian, 0,
+ t,
blurDifference_mem,
buffer_mem,
transform_mem,
@@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
task->reconstruction_state.filter_window,
w, h, stride,
pass_stride,
- r, 4);
+ r, 4,
+ frame_offset,
+ use_time);
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
@@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
{
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+ denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 9ac7c3db23d..cb04aac35f4 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -17,17 +17,21 @@
#ifndef __FILTER_DEFINES_H__
#define __FILTER_DEFINES_H__
-#define DENOISE_FEATURES 10
+#define DENOISE_FEATURES 11
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
+#define DENOISE_MAX_FRAMES 16
+
typedef struct TileInfo {
int offsets[9];
int strides[9];
int x[4];
int y[4];
int from_render;
+ int frames[DENOISE_MAX_FRAMES];
+ int num_frames;
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 6226ed2c2ef..e1ea6487aa9 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -18,19 +18,23 @@
#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always points to the current pixel in the first pass. */
-#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-#define END_FOR_PIXEL_WINDOW } \
- pixel_buffer += buffer_w - (high.x - low.x); \
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
}
-ccl_device_inline void filter_get_features(int2 pixel,
+ccl_device_inline void filter_get_features(int3 pixel,
const ccl_global float *ccl_restrict buffer,
float *features,
+ bool use_time,
const float *ccl_restrict mean,
int pass_stride)
{
@@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel,
features[7] = ccl_get_feature(buffer, 5);
features[8] = ccl_get_feature(buffer, 6);
features[9] = ccl_get_feature(buffer, 7);
+ if(use_time) {
+ features[10] = pixel.z;
+ }
if(mean) {
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < (use_time? 11 : 10); i++) {
features[i] -= mean[i];
+ }
}
}
-ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ccl_device_inline void filter_get_feature_scales(int3 pixel,
const ccl_global float *ccl_restrict buffer,
float *scales,
+ bool use_time,
const float *ccl_restrict mean,
int pass_stride)
{
@@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel,
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
ccl_get_feature(buffer, 6) - mean[8],
ccl_get_feature(buffer, 7) - mean[9]));
+ if(use_time) {
+ scales[6] = fabsf(pixel.z - mean[10]);
+ }
}
-ccl_device_inline void filter_calculate_scale(float *scale)
+ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
{
scale[0] = 1.0f/max(scale[0], 0.01f);
scale[1] = 1.0f/max(scale[1], 0.01f);
scale[2] = 1.0f/max(scale[2], 0.01f);
+ if(use_time) {
+ scale[10] = 1.0f/max(scale[6], 0.01f);
+ }
scale[6] = 1.0f/max(scale[4], 0.01f);
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
@@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row,
const ccl_global float *ccl_restrict transform,
int stride,
int row,
- float feature)
+ float feature,
+ int transform_row_stride)
{
for(int i = 0; i < rank; i++) {
- design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
}
}
/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
const ccl_global float *ccl_restrict p_buffer,
- int2 q_pixel,
+ int3 q_pixel,
const ccl_global float *ccl_restrict q_buffer,
int pass_stride,
int rank,
float *design_row,
const ccl_global float *ccl_restrict transform,
- int stride)
+ int stride,
+ bool use_time)
{
+ int num_features = use_time? 11 : 10;
+
design_row[0] = 1.0f;
math_vector_zero(design_row+1, rank);
- design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
- design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
- design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+
+#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
+ DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+ DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+ DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+ if(use_time) {
+ DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+ }
+#undef DESIGN_ROW_ADD
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 3ddd8712266..5dd001ffb93 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+ float4 t4 = make_float4(pixel.z); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ float4 y4 = make_float4(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+ int4 active_pixels = x4 < make_float4(high.x);
-#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE } \
- pixel_buffer += buffer_w - (pixel.x - low.x); \
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
}
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *features,
+ bool use_time,
const float4 *ccl_restrict mean,
int pass_stride)
{
+ int num_features = use_time? 11 : 10;
+
features[0] = x;
features[1] = y;
features[2] = fabs(ccl_get_feature_sse(0));
@@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
features[7] = ccl_get_feature_sse(5);
features[8] = ccl_get_feature_sse(6);
features[9] = ccl_get_feature_sse(7);
+ if(use_time) {
+ features[10] = t;
+ }
+
if(mean) {
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < num_features; i++) {
features[i] = features[i] - mean[i];
+ }
}
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < num_features; i++) {
features[i] = mask(active_pixels, features[i]);
+ }
}
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *scales,
+ bool use_time,
const float4 *ccl_restrict mean,
int pass_stride)
{
@@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
sqr(ccl_get_feature_sse(6) - mean[8]) +
sqr(ccl_get_feature_sse(7) - mean[9]);
- for(int i = 0; i < 6; i++)
+ if(use_time) {
+ scales[6] = fabs(t - mean[10]);
+ }
+
+ for(int i = 0; i < (use_time? 7 : 6); i++)
scales[i] = mask(active_pixels, scales[i]);
}
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
{
scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+ if(use_time) {
+ scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
+ }
scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 0c4387af540..9eb3c603a4a 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
int4 rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
for(int y = rect.y; y < rect.w; y++) {
int idx_p = y*stride + aligned_lowx;
- int idx_q = (y+dy)*stride + aligned_lowx + dx;
+ int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
float4 diff = make_float4(0.0f);
float4 scale_fac;
@@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
}
}
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
const float *ccl_restrict difference_image,
const float *ccl_restrict buffer,
float *transform,
@@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
int4 rect,
int4 filter_window,
int stride, int f,
- int pass_stride)
+ int pass_stride,
+ int frame_offset,
+ bool use_time)
{
int4 clip_area = rect_clip(rect, filter_window);
/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
@@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
int *l_rank = rank + storage_ofs;
kernel_filter_construct_gramian(x, y, 1,
- dx, dy,
+ dx, dy, t,
stride,
pass_stride,
+ frame_offset,
+ use_time,
buffer,
l_transform, l_rank,
weight, l_XtWX, l_XtWY, 0);
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index d8e2e4d08aa..12636393243 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
ccl_global float *difference_image,
int4 rect, int stride,
int channel_offset,
+ int frame_offset,
float a, float k_2)
{
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
int numChannels = channel_offset? 3 : 1;
float diff = 0.0f;
@@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
- int dx, int dy,
+ int dx, int dy, int t,
const ccl_global float *ccl_restrict difference_image,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
@@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
int4 filter_window,
int stride, int f,
int pass_stride,
+ int frame_offset,
+ bool use_time,
int localIdx)
{
const int low = max(rect.x, x-f);
@@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
kernel_filter_construct_gramian(x, y,
rect_size(filter_window),
- dx, dy,
+ dx, dy, t,
stride,
pass_stride,
+ frame_offset,
+ use_time,
buffer,
transform, rank,
weight, XtWX, XtWY,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index e5d3b0da835..31a7487c77a 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
int storage_stride,
- int dx, int dy,
+ int dx, int dy, int t,
int buffer_stride,
int pass_stride,
+ int frame_offset,
+ bool use_time,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
ccl_global int *rank,
@@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
}
int p_offset = y * buffer_stride + x;
- int q_offset = (y+dy) * buffer_stride + (x+dx);
+ int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
#ifdef __KERNEL_GPU__
const int stride = storage_stride;
@@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
return;
}
- filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
- make_int2(x+dx, y+dy), buffer + q_offset,
- pass_stride, *rank, design_row, transform, stride);
+ filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset,
+ make_int3(x+dx, y+dy, t), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride, use_time);
#ifdef __KERNEL_GPU__
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index a5f87c05ec0..94e27bb02fd 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
float *transform, int *rank,
int radius, float pca_threshold)
{
@@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
float features[DENOISE_FEATURES];
- /* Temporary storage, used in different steps of the algorithm. */
- float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
- float tempvector[2*DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
+
+ int num_features = use_time? 11 : 10;
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, DENOISE_FEATURES);
+ math_vector_zero(feature_means, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
- math_vector_add(feature_means, features, DENOISE_FEATURES);
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
} END_FOR_PIXEL_WINDOW
- math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
- float *feature_scale = tempvector;
- math_vector_zero(feature_scale, DENOISE_FEATURES);
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, num_features);
FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW
- filter_calculate_scale(feature_scale);
+ filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
- float* feature_matrix = tempmatrix;
- math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
@@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ math_vector_mul(transform + i*num_features, feature_scale, num_features);
}
- math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+ math_matrix_transpose(transform, num_features, 1);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 83a1222bbdb..ed8ddcb49b1 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
ccl_global float *transform,
ccl_global int *rank,
int radius, float pca_threshold,
@@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
float features[DENOISE_FEATURES];
#endif
+ int num_features = use_time? 11 : 10;
+
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
const ccl_global float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, DENOISE_FEATURES);
+ math_vector_zero(feature_means, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
- math_vector_add(feature_means, features, DENOISE_FEATURES);
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
} END_FOR_PIXEL_WINDOW
- math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, DENOISE_FEATURES);
+ math_vector_zero(feature_scale, num_features);
FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW
- filter_calculate_scale(feature_scale);
+ filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
- math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+ math_matrix_transpose(transform, num_features, transform_stride);
/* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int i = 0; i < num_features; i++) {
for(int j = 0; j < (*rank); j++) {
- transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
}
}
}
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 9e65f61664b..10bd3e477e9 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
float *transform, int *rank,
int radius, float pca_threshold)
{
@@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
float4 features[DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
+ int num_features = use_time? 11 : 10;
+
+ /* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+ /* === Shift feature passes to have mean 0. === */
float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ math_vector_zero_sse(feature_means, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
- math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add_sse(feature_means, num_features, features);
} END_FOR_PIXEL_WINDOW_SSE
float4 pixel_scale = make_float4(1.0f / num_pixels);
- for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int i = 0; i < num_features; i++) {
feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
}
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ math_vector_zero_sse(feature_scale, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW_SSE
- filter_calculate_scale_sse(feature_scale);
+ filter_calculate_scale_sse(feature_scale, use_time);
+ /* === Generate the feature transformation. ===
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ math_matrix_zero_sse(feature_matrix_sse, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+ filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul_sse(features, num_features, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+ math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
- math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+ math_matrix_transpose(transform, num_features, 1);
/* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
+ for(int i = 0; i < num_features; i++) {
+ math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
}
}
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 08333c7a455..02c85562db8 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
int r);
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ TileInfo *tiles,
int x,
int y,
int storage_ofs,
@@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int *rank,
int* rect,
int pass_stride,
+ int frame_stride,
+ bool use_time,
int radius,
float pca_threshold);
@@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int* rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2);
@@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
+ int t,
float *difference_image,
float *buffer,
float *transform,
@@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int *filter_window,
int stride,
int f,
- int pass_stride);
+ int pass_stride,
+ int frame_offset,
+ bool use_time);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
float *accum_image,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index b792367e3ab..c29505880cb 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
}
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ TileInfo *tile_info,
int x,
int y,
int storage_ofs,
@@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int *rank,
int* prefilter_rect,
int pass_stride,
+ int frame_stride,
+ bool use_time,
int radius,
float pca_threshold)
{
@@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
rank += storage_ofs;
transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
+ tile_info,
x, y,
load_int4(prefilter_rect),
pass_stride,
+ frame_stride,
+ use_time,
transform,
rank,
radius,
@@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int *rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
load_int4(rect),
stride,
channel_offset,
+ frame_offset,
a, k_2);
#endif
}
@@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
+ int t,
float *difference_image,
float *buffer,
float *transform,
@@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int *filter_window,
int stride,
int f,
- int pass_stride)
+ int pass_stride,
+ int frame_offset,
+ bool use_time)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
- kernel_filter_nlm_construct_gramian(dx, dy,
+ kernel_filter_nlm_construct_gramian(dx, dy, t,
difference_image,
buffer,
transform, rank,
@@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
load_int4(rect),
load_int4(filter_window),
stride, f,
- pass_stride);
+ pass_stride,
+ frame_offset,
+ use_time);
#endif
}
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 3b51bb41aed..5b552b01413 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -29,7 +29,7 @@
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
- TileInfo *tile_info,
+ CCL_FILTER_TILE_INFO,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
@@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
- TileInfo *tile_info,
+ CCL_FILTER_TILE_INFO,
int m_offset,
int v_offset,
float *mean,
@@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ CCL_FILTER_TILE_INFO,
float *transform, int *rank,
int4 filter_area, int4 rect,
int radius, float pca_threshold,
- int pass_stride)
+ int pass_stride, int frame_stride,
+ bool use_time)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
@@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
int *l_rank = rank + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
+ tile_info,
x + filter_area.x, y + filter_area.y,
- rect, pass_stride,
+ rect,
+ pass_stride, frame_stride,
+ use_time,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
@@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
int pass_stride,
int r,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
difference_image + ofs,
rect, stride,
channel_offset,
+ frame_offset,
a, k_2);
}
}
@@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(int t,
+ const float *ccl_restrict difference_image,
const float *ccl_restrict buffer,
float const* __restrict__ transform,
int *rank,
@@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
int stride,
int pass_stride,
int r,
- int f)
+ int f,
+ int frame_offset,
+ bool use_time)
{
int4 co, rect;
int ofs;
if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w,
+ t,
difference_image + ofs,
buffer,
transform, rank,
@@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
rect, filter_window,
stride, f,
pass_stride,
+ frame_offset,
+ use_time,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 8a821ee281d..996bc27f71b 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
}
__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
ccl_global float *transform,
ccl_global int *rank,
int4 filter_area,
int4 rect,
int pass_stride,
+ int frame_stride,
+ char use_time,
int radius,
float pca_threshold)
{
@@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
ccl_global int *l_rank = rank + y*filter_area.z + x;
ccl_global float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
+ CCL_FILTER_TILE_INFO_ARG,
x + filter_area.x, y + filter_area.y,
- rect, pass_stride,
+ rect,
+ pass_stride, frame_stride,
+ use_time,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
@@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
int pass_stride,
int r,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
difference_image + ofs,
rect, stride,
channel_offset,
+ frame_offset,
a, k_2);
}
}
@@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
}
}
-__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
+ const ccl_global float *ccl_restrict difference_image,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
ccl_global int *rank,
@@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
int stride,
int pass_stride,
int r,
- int f)
+ int f,
+ int frame_offset,
+ char use_time)
{
int4 co, rect;
int ofs;
if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w,
+ t,
difference_image + ofs,
buffer,
transform, rank,
@@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
rect, filter_window,
stride, f,
pass_stride,
+ frame_offset,
+ use_time,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}