diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 23 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 21 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 59 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 7 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 1 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 31 |
7 files changed, 117 insertions, 27 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 6668acc9cbe..93c63b92a55 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -186,15 +186,15 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -512,7 +512,7 @@ public: difference, local_rect, w, channel_offset, - a, k_2); + 0, a, k_2); filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); @@ -542,6 +542,7 @@ public: for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + task->tile_info, x + task->filter_area.x, y + task->filter_area.y, y*task->filter_area.z + x, @@ -549,6 +550,8 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->buffer.pass_stride, + task->buffer.frame_stride, + task->buffer.use_time, task->radius, task->pca_threshold); } @@ -559,6 +562,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); @@ -568,6 +572,7 @@ public: float *blurDifference = temporary_mem + task->buffer.pass_stride; int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; @@ -583,12 +588,14 @@ public: local_rect, task->buffer.stride, task->buffer.pass_stride, + frame_offset, 1.0f, task->nlm_k_2); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, + task->tile_info->frames[frame], blurDifference, (float*) task->buffer.mem.device_pointer, (float*) task->storage.transform.device_pointer, @@ -599,7 +606,9 @@ public: &task->reconstruction_state.filter_window.x, task->buffer.stride, 4, - task->buffer.pass_stride); + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); } return true; @@ -787,7 +796,7 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index cb7d8bbb224..e21d974ebbe 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1301,6 +1301,7 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + int frame_offset = 0; if(have_error()) return false; @@ -1327,7 +1328,7 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; @@ -1367,13 +1368,16 @@ public: task->storage.h); void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->filter_area, &task->rect, &task->radius, &task->pca_threshold, - &task->buffer.pass_stride}; + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); @@ -1383,6 +1387,7 @@ public: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { if(have_error()) @@ -1398,6 +1403,8 @@ public: int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); @@ -1430,10 +1437,12 @@ public: &w, &h, &stride, &pass_stride, &r, &pass_stride, + &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, + void *construct_gramian_args[] = {&t, + &blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, @@ -1442,7 +1451,9 @@ public: &task->reconstruction_state.filter_window, &w, &h, &stride, &pass_stride, &r, - &f}; + &f, + &frame_offset, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1635,7 +1646,7 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 724171c3acb..61e0ba47ab8 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); } + render_buffer.frame_stride = task.frame_stride; render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; @@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); tile_info->from_render = task.denoising_from_render? 1 : 0; + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for(int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i-1]; + } + write_passes = task.denoising_write_passes; do_filter = task.denoising_do_filter; } @@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.use_intensity = write_passes; + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; @@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color() } } +void DenoisingTask::load_buffer() +{ + device_ptr null_ptr = (device_ptr) 0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity? 15 : 14; + for(int i = 0; i < tile_info->num_frames; i++) { + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; +} + void DenoisingTask::write_buffer() { reconstruction_state.buffer_params = make_int4(target_buffer.offset, @@ -259,11 +287,17 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr); - delete scale_sub_ptr; + for(int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } functions.solve(target_buffer.ptr); } @@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile) setup_denoising_buffer(); - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); + if(tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } if(do_filter) { construct_transform(); diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index cddcd3bd0c9..5869aa05390 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -38,6 +38,7 @@ public: struct RenderBuffers { int offset; int pass_stride; + int frame_stride; int samples; } render_buffer; @@ -70,7 +71,8 @@ public: )> non_local_means; function<bool(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr scale_ptr + device_ptr scale_ptr, + int frame )> accumulate; function<bool(device_ptr output_ptr)> solve; function<bool()> construct_transform; @@ -156,8 +158,10 @@ public: int stride; int h; int width; + int frame_stride; device_only_memory<float> mem; device_only_memory<float> temporary_mem; + bool use_time; bool use_intensity; bool gpu_temporary_mem; @@ -179,6 +183,7 @@ protected: void construct_transform(); void reconstruct(); + void load_buffer(); void write_buffer(); }; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 97bcde99af6..2871bc5761a 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -73,11 +73,13 @@ public: float denoising_feature_strength; bool denoising_relative_pca; bool denoising_from_render; + vector<int> denoising_frames; bool denoising_do_filter; bool denoising_write_passes; int pass_stride; + int frame_stride; int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4d42ddc0c53..9b763167459 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -422,6 +422,7 @@ protected: bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task); bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a0a1cf68c32..4417065bb7f 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, transform_mem, rank_mem, task->filter_area, task->rect, task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, task->radius, task->pca_threshold); @@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, + int frame, DenoisingTask *task) { cl_mem color_mem = CL_MEM_PTR(color_ptr); @@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; int r = task->radius; int pass_stride = task->buffer.pass_stride; @@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, pass_stride, + frame_offset, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, @@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, pass_stride, r, 4); kernel_set_args(ckNLMConstructGramian, 0, + t, blurDifference_mem, buffer_mem, transform_mem, @@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, task->reconstruction_state.filter_window, w, h, stride, pass_stride, - r, 4); + r, 4, + frame_offset, + use_time); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); |