diff options
20 files changed, 388 insertions, 174 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 6668acc9cbe..93c63b92a55 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -186,15 +186,15 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -512,7 +512,7 @@ public: difference, local_rect, w, channel_offset, - a, k_2); + 0, a, k_2); filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); @@ -542,6 +542,7 @@ public: for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + task->tile_info, x + task->filter_area.x, y + task->filter_area.y, y*task->filter_area.z + x, @@ -549,6 +550,8 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->buffer.pass_stride, + task->buffer.frame_stride, + task->buffer.use_time, task->radius, task->pca_threshold); } @@ -559,6 +562,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); @@ -568,6 +572,7 @@ public: float *blurDifference = temporary_mem + task->buffer.pass_stride; int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; @@ -583,12 +588,14 @@ public: local_rect, task->buffer.stride, task->buffer.pass_stride, + frame_offset, 1.0f, task->nlm_k_2); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, + task->tile_info->frames[frame], blurDifference, (float*) task->buffer.mem.device_pointer, (float*) task->storage.transform.device_pointer, @@ -599,7 +606,9 @@ public: &task->reconstruction_state.filter_window.x, task->buffer.stride, 4, - task->buffer.pass_stride); + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); } return true; @@ -787,7 +796,7 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index cb7d8bbb224..e21d974ebbe 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1301,6 +1301,7 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + int frame_offset = 0; if(have_error()) return false; @@ -1327,7 +1328,7 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; @@ -1367,13 +1368,16 @@ public: task->storage.h); void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->filter_area, &task->rect, &task->radius, &task->pca_threshold, - &task->buffer.pass_stride}; + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); @@ -1383,6 +1387,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { if(have_error()) @@ -1398,6 +1403,8 @@ public: int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); @@ -1430,10 +1437,12 @@ public: &w, &h, &stride, &pass_stride, &r, &pass_stride, + &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, + void *construct_gramian_args[] = {&t, + &blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, @@ -1442,7 +1451,9 @@ public: &task->reconstruction_state.filter_window, &w, &h, &stride, &pass_stride, &r, - &f}; + &f, + &frame_offset, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1635,7 +1646,7 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 724171c3acb..61e0ba47ab8 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); } + render_buffer.frame_stride = task.frame_stride; render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; @@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); tile_info->from_render = task.denoising_from_render? 1 : 0; + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for(int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i-1]; + } + write_passes = task.denoising_write_passes; do_filter = task.denoising_do_filter; } @@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.use_intensity = write_passes; + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; @@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color() } } +void DenoisingTask::load_buffer() +{ + device_ptr null_ptr = (device_ptr) 0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity? 15 : 14; + for(int i = 0; i < tile_info->num_frames; i++) { + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; +} + void DenoisingTask::write_buffer() { reconstruction_state.buffer_params = make_int4(target_buffer.offset, @@ -259,11 +287,17 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr); - delete scale_sub_ptr; + for(int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } functions.solve(target_buffer.ptr); } @@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile) setup_denoising_buffer(); - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); + if(tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } if(do_filter) { construct_transform(); diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index cddcd3bd0c9..5869aa05390 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -38,6 +38,7 @@ public: struct RenderBuffers { int offset; int pass_stride; + int frame_stride; int samples; } render_buffer; @@ -70,7 +71,8 @@ public: )> non_local_means; function<bool(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr scale_ptr + device_ptr scale_ptr, + int frame )> accumulate; function<bool(device_ptr output_ptr)> solve; function<bool()> construct_transform; @@ -156,8 +158,10 @@ public: int stride; int h; int width; + int frame_stride; device_only_memory<float> mem; device_only_memory<float> temporary_mem; + bool use_time; bool use_intensity; bool gpu_temporary_mem; @@ -179,6 +183,7 @@ protected: void construct_transform(); void reconstruct(); + void load_buffer(); void write_buffer(); }; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 97bcde99af6..2871bc5761a 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -73,11 +73,13 @@ public: float denoising_feature_strength; bool denoising_relative_pca; bool denoising_from_render; + vector<int> denoising_frames; bool denoising_do_filter; bool denoising_write_passes; int pass_stride; + int frame_stride; int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4d42ddc0c53..9b763167459 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -422,6 +422,7 @@ protected: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task); bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a0a1cf68c32..4417065bb7f 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, transform_mem, rank_mem, task->filter_area, task->rect, task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, task->radius, task->pca_threshold); @@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { cl_mem color_mem = CL_MEM_PTR(color_ptr); @@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; int r = task->radius; int pass_stride = task->buffer.pass_stride; @@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, pass_stride, + frame_offset, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, @@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, 4); kernel_set_args(ckNLMConstructGramian, 0, + t, blurDifference_mem, buffer_mem, transform_mem, @@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, task->reconstruction_state.filter_window, w, h, stride, pass_stride, - r, 4); + r, 4, + frame_offset, + use_time); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 9ac7c3db23d..cb04aac35f4 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -17,17 +17,21 @@ #ifndef __FILTER_DEFINES_H__ #define __FILTER_DEFINES_H__ -#define DENOISE_FEATURES 10 +#define DENOISE_FEATURES 11 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) #define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) #define XTWY_SIZE (DENOISE_FEATURES+1) +#define DENOISE_MAX_FRAMES 16 + typedef struct TileInfo { int offsets[9]; int strides[9]; int x[4]; int y[4]; int from_render; + int frames[DENOISE_MAX_FRAMES]; + int num_frames; /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ ccl_global float *buffers[9]; diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 6226ed2c2ef..e1ea6487aa9 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -18,19 +18,23 @@ #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). - * pixel_buffer always points to the current pixel in the first pass. */ -#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { -#define END_FOR_PIXEL_WINDOW } \ - pixel_buffer += buffer_w - (high.x - low.x); \ +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features(int2 pixel, +ccl_device_inline void filter_get_features(int3 pixel, const ccl_global float *ccl_restrict buffer, float *features, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel, features[7] = ccl_get_feature(buffer, 5); features[8] = ccl_get_feature(buffer, 6); features[9] = ccl_get_feature(buffer, 7); + if(use_time) { + features[10] = pixel.z; + } if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < (use_time? 11 : 10); i++) { features[i] -= mean[i]; + } } } -ccl_device_inline void filter_get_feature_scales(int2 pixel, +ccl_device_inline void filter_get_feature_scales(int3 pixel, const ccl_global float *ccl_restrict buffer, float *scales, + bool use_time, const float *ccl_restrict mean, int pass_stride) { @@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], ccl_get_feature(buffer, 6) - mean[8], ccl_get_feature(buffer, 7) - mean[9])); + if(use_time) { + scales[6] = fabsf(pixel.z - mean[10]); + } } -ccl_device_inline void filter_calculate_scale(float *scale) +ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) { scale[0] = 1.0f/max(scale[0], 0.01f); scale[1] = 1.0f/max(scale[1], 0.01f); scale[2] = 1.0f/max(scale[2], 0.01f); + if(use_time) { + scale[10] = 1.0f/max(scale[6], 0.01f); + } scale[6] = 1.0f/max(scale[4], 0.01f); scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); @@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row, const ccl_global float *ccl_restrict transform, int stride, int row, - float feature) + float feature, + int transform_row_stride) { for(int i = 0; i < rank; i++) { - design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature; } } /* Fill the design row. */ -ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, +ccl_device_inline void filter_get_design_row_transform(int3 p_pixel, const ccl_global float *ccl_restrict p_buffer, - int2 q_pixel, + int3 q_pixel, const ccl_global float *ccl_restrict q_buffer, int pass_stride, int rank, float *design_row, const ccl_global float *ccl_restrict transform, - int stride) + int stride, + bool use_time) { + int num_features = use_time? 11 : 10; + design_row[0] = 1.0f; math_vector_zero(design_row+1, rank); - design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); - design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); - design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + +#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features); + DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); + DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); + DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + if(use_time) { + DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) + } +#undef DESIGN_ROW_ADD } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3ddd8712266..5dd001ffb93 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. + * Repeat the loop for every secondary frame if there are any. */ +#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ + float4 t4 = make_float4(pixel.z); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); -#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE } \ - pixel_buffer += buffer_w - (pixel.x - low.x); \ +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ } -ccl_device_inline void filter_get_features_sse(float4 x, float4 y, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *features, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { + int num_features = use_time? 11 : 10; + features[0] = x; features[1] = y; features[2] = fabs(ccl_get_feature_sse(0)); @@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, features[7] = ccl_get_feature_sse(5); features[8] = ccl_get_feature_sse(6); features[9] = ccl_get_feature_sse(7); + if(use_time) { + features[10] = t; + } + if(mean) { - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = features[i] - mean[i]; + } } - for(int i = 0; i < DENOISE_FEATURES; i++) + for(int i = 0; i < num_features; i++) { features[i] = mask(active_pixels, features[i]); + } } -ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *scales, + bool use_time, const float4 *ccl_restrict mean, int pass_stride) { @@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + sqr(ccl_get_feature_sse(7) - mean[9]); - for(int i = 0; i < 6; i++) + if(use_time) { + scales[6] = fabs(t - mean[10]); + } + + for(int i = 0; i < (use_time? 7 : 6); i++) scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(float4 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) { scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + if(use_time) { + scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));; + } scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 0c4387af540..9eb3c603a4a 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { @@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, for(int y = rect.y; y < rect.w; y++) { int idx_p = y*stride + aligned_lowx; - int idx_q = (y+dy)*stride + aligned_lowx + dx; + int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { float4 diff = make_float4(0.0f); float4 scale_fac; @@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float *transform, @@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int4 rect, int4 filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { int4 clip_area = rect_clip(rect, filter_window); /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ @@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int *l_rank = rank + storage_ofs; kernel_filter_construct_gramian(x, y, 1, - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, l_transform, l_rank, weight, l_XtWX, l_XtWY, 0); diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index d8e2e4d08aa..12636393243 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset; int numChannels = channel_offset? 3 : 1; float diff = 0.0f; @@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, } ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, - int dx, int dy, + int dx, int dy, int t, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, @@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int4 filter_window, int stride, int f, int pass_stride, + int frame_offset, + bool use_time, int localIdx) { const int low = max(rect.x, x-f); @@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, kernel_filter_construct_gramian(x, y, rect_size(filter_window), - dx, dy, + dx, dy, t, stride, pass_stride, + frame_offset, + use_time, buffer, transform, rank, weight, XtWX, XtWY, diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index e5d3b0da835..31a7487c77a 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, - int dx, int dy, + int dx, int dy, int t, int buffer_stride, int pass_stride, + int frame_offset, + bool use_time, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, } int p_offset = y * buffer_stride + x; - int q_offset = (y+dy) * buffer_stride + (x+dx); + int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset; #ifdef __KERNEL_GPU__ const int stride = storage_stride; @@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, return; } - filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, - make_int2(x+dx, y+dy), buffer + q_offset, - pass_stride, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset, + make_int3(x+dx, y+dy, t), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride, use_time); #ifdef __KERNEL_GPU__ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index a5f87c05ec0..94e27bb02fd 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float features[DENOISE_FEATURES]; - /* Temporary storage, used in different steps of the algorithm. */ - float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; - float tempvector[2*DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + + int num_features = use_time? 11 : 10; /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float *feature_scale = tempvector; - math_vector_zero(feature_scale, DENOISE_FEATURES); + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float* feature_matrix = tempmatrix; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } @@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + math_vector_mul(transform + i*num_features, feature_scale, num_features); } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 83a1222bbdb..ed8ddcb49b1 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, ccl_global float *transform, ccl_global int *rank, int radius, float pca_threshold, @@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re float features[DENOISE_FEATURES]; #endif + int num_features = use_time? 11 : 10; + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; const ccl_global float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, DENOISE_FEATURES); + math_vector_zero(feature_means, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); - math_vector_add(feature_means, features, DENOISE_FEATURES); + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); } END_FOR_PIXEL_WINDOW - math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, DENOISE_FEATURES); + math_vector_zero(feature_scale, num_features); FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_max(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW - filter_calculate_scale(feature_scale); + filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, DENOISE_FEATURES); + math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul(features, feature_scale, DENOISE_FEATURES); - math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); } END_FOR_PIXEL_WINDOW - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + math_matrix_transpose(transform, num_features, transform_stride); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { for(int j = 0; j < (*rank); j++) { - transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + transform[(i*num_features + j)*transform_stride] *= feature_scale[i]; } } } diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 9e65f61664b..10bd3e477e9 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -17,8 +17,10 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, int x, int y, int4 rect, - int pass_stride, + int pass_stride, int frame_stride, + bool use_time, float *transform, int *rank, int radius, float pca_threshold) { @@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; - int2 pixel; + int3 pixel; + int num_features = use_time? 11 : 10; + + /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + /* === Shift feature passes to have mean 0. === */ float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, DENOISE_FEATURES); + math_vector_zero_sse(feature_means, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); - math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add_sse(feature_means, num_features, features); } END_FOR_PIXEL_WINDOW_SSE float4 pixel_scale = make_float4(1.0f / num_pixels); - for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int i = 0; i < num_features; i++) { feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + math_vector_zero_sse(feature_scale, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, num_features); } END_FOR_PIXEL_WINDOW_SSE - filter_calculate_scale_sse(feature_scale); + filter_calculate_scale_sse(feature_scale, use_time); + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + math_matrix_zero_sse(feature_matrix_sse, num_features); FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); - math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); + filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul_sse(features, num_features, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); *rank = 0; /* Prevent overfitting when a small window is used. */ - int max_rank = min(DENOISE_FEATURES, num_pixels/3); + int max_rank = min(num_features, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++) { - threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + for(int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i*num_features+i]; } threshold_energy *= 1.0f - (-pca_threshold); @@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; reduced_energy += s; } } else { for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*DENOISE_FEATURES+i]; + float s = feature_matrix[i*num_features+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; } } - math_matrix_transpose(transform, DENOISE_FEATURES, 1); + math_matrix_transpose(transform, num_features, 1); /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); + for(int i = 0; i < num_features; i++) { + math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index 08333c7a455..02c85562db8 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, int r); void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tiles, int x, int y, int storage_ofs, @@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold); @@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int* rect, int stride, int channel_offset, + int frame_offset, float a, float k_2); @@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride); + int pass_stride, + int frame_offset, + bool use_time); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index b792367e3ab..c29505880cb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, } void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + TileInfo *tile_info, int x, int y, int storage_ofs, @@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, int *rank, int* prefilter_rect, int pass_stride, + int frame_stride, + bool use_time, int radius, float pca_threshold) { @@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, rank += storage_ofs; transform += storage_ofs*TRANSFORM_SIZE; kernel_filter_construct_transform(buffer, + tile_info, x, y, load_int4(prefilter_rect), pass_stride, + frame_stride, + use_time, transform, rank, radius, @@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int *rect, int stride, int channel_offset, + int frame_offset, float a, float k_2) { @@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, load_int4(rect), stride, channel_offset, + frame_offset, a, k_2); #endif } @@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, + int t, float *difference_image, float *buffer, float *transform, @@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int *filter_window, int stride, int f, - int pass_stride) + int pass_stride, + int frame_offset, + bool use_time) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, + kernel_filter_nlm_construct_gramian(dx, dy, t, difference_image, buffer, transform, rank, @@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, load_int4(rect), load_int4(filter_window), stride, f, - pass_stride); + pass_stride, + frame_offset, + use_time); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 3b51bb41aed..5b552b01413 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -29,7 +29,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_get_feature(int sample, - TileInfo *tile_info, + CCL_FILTER_TILE_INFO, int m_offset, int v_offset, float *mean, @@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + CCL_FILTER_TILE_INFO, float *transform, int *rank, int4 filter_area, int4 rect, int radius, float pca_threshold, - int pass_stride) + int pass_stride, int frame_stride, + bool use_time) { int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; @@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, int *l_rank = rank + y*filter_area.z + x; float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + tile_info, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, difference_image + ofs, rect, stride, channel_offset, + frame_offset, a, k_2); } } @@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_construct_gramian(int t, + const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float const* __restrict__ transform, int *rank, @@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + bool use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, threadIdx.y*blockDim.x + threadIdx.x); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 8a821ee281d..996bc27f71b 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, } __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + CCL_FILTER_TILE_INFO, ccl_global float *transform, ccl_global int *rank, int4 filter_area, int4 rect, int pass_stride, + int frame_stride, + char use_time, int radius, float pca_threshold) { @@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ ccl_global int *l_rank = rank + y*filter_area.z + x; ccl_global float *l_transform = transform + y*filter_area.z + x; kernel_filter_construct_transform(buffer, + CCL_FILTER_TILE_INFO_ARG, x + filter_area.x, y + filter_area.y, - rect, pass_stride, + rect, + pass_stride, frame_stride, + use_time, l_transform, l_rank, radius, pca_threshold, filter_area.z*filter_area.w, @@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int pass_stride, int r, int channel_offset, + int frame_offset, float a, float k_2) { @@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ difference_image + ofs, rect, stride, channel_offset, + frame_offset, a, k_2); } } @@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, } } -__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image, +__kernel void kernel_ocl_filter_nlm_construct_gramian(int t, + const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, @@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int stride, int pass_stride, int r, - int f) + int f, + int frame_offset, + char use_time) { int4 co, rect; int ofs; if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, + t, difference_image + ofs, buffer, transform, rank, @@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc rect, filter_window, stride, f, pass_stride, + frame_offset, + use_time, get_local_id(1)*get_local_size(0) + get_local_id(0)); } } |