diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2019-02-06 17:22:53 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2019-02-06 17:22:53 +0300 |
commit | e21ae0bb267a54482108ddd4feed99c89241804b (patch) | |
tree | 5d5220c578c0e41533a2a4430018ced6ff13e08c /intern/cycles/device | |
parent | e8292466bcb69282798bba5dd701fff514cb0b78 (diff) | |
parent | fccf506ed7fd96f8a8f5edda7b99f564a386321a (diff) |
Merge branch 'blender2.7'
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 90 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 182 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 119 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 27 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 8 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 16 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 100 |
7 files changed, 414 insertions, 128 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index a92c052a5df..1f39a412083 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -180,20 +180,21 @@ public: KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -218,6 +219,7 @@ public: REGISTER_KERNEL(shader), REGISTER_KERNEL(filter_divide_shadow), REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_write_feature), REGISTER_KERNEL(filter_detect_outliers), REGISTER_KERNEL(filter_combine_halves), REGISTER_KERNEL(filter_nlm_calc_difference), @@ -487,6 +489,8 @@ public: int w = align_up(rect.z-rect.x, 4); int h = rect.w-rect.y; + int stride = task->buffer.stride; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *blurDifference = temporary_mem; @@ -504,10 +508,11 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) guide_ptr, (float*) variance_ptr, + NULL, difference, local_rect, - w, 0, - a, k_2); + w, channel_offset, + 0, a, k_2); filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); @@ -520,7 +525,8 @@ public: (float*) out_ptr, weightAccum, local_rect, - w, f); + channel_offset, + stride, f); } int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; @@ -536,6 +542,7 @@ public: for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + task->tile_info, x + task->filter_area.x, y + task->filter_area.y, y*task->filter_area.z + x, @@ -543,6 +550,8 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->buffer.pass_stride, + task->buffer.frame_stride, + task->buffer.use_time, task->radius, task->pca_threshold); } @@ -550,21 +559,20 @@ public: return true; } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; float *difference = temporary_mem; float *blurDifference = temporary_mem + task->buffer.pass_stride; int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; @@ -575,16 +583,19 @@ public: filter_nlm_calc_difference_kernel()(dx, dy, (float*) color_ptr, (float*) color_variance_ptr, + (float*) scale_ptr, difference, local_rect, task->buffer.stride, task->buffer.pass_stride, + frame_offset, 1.0f, task->nlm_k_2); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, + task->tile_info->frames[frame], blurDifference, (float*) task->buffer.mem.device_pointer, (float*) task->storage.transform.device_pointer, @@ -595,8 +606,17 @@ public: &task->reconstruction_state.filter_window.x, task->buffer.stride, 4, - task->buffer.pass_stride); + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); } + + return true; + } + + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_finalize_kernel()(x, @@ -661,6 +681,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); @@ -674,6 +695,7 @@ public: x, y, (float*) mean_ptr, (float*) variance_ptr, + scale, &task->rect.x, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -682,6 +704,26 @@ public: return true; } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_write_feature_kernel()(task->render_buffer.samples, + x + task->filter_area.x, + y + task->filter_area.y, + &task->reconstruction_state.buffer_params.x, + (float*) from_ptr, + (float*) buffer_ptr, + out_offset, + &task->rect.x); + } + } + return true; + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -754,11 +796,13 @@ public: tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 67f5793e793..ada538adf32 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1300,7 +1300,8 @@ public: int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = 0; + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + int frame_offset = 0; if(have_error()) return false; @@ -1308,6 +1309,7 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; + CUdeviceptr scale_ptr = 0; cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); @@ -1326,10 +1328,10 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1366,32 +1368,33 @@ public: task->storage.h); void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->filter_area, &task->rect, &task->radius, &task->pca_threshold, - &task->buffer.pass_stride}; + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); return !have_error(); } - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { if(have_error()) return false; CUDAContextScope scope(this); - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - int r = task->radius; int f = 4; float a = 1.0f; @@ -1400,6 +1403,8 @@ public: int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); @@ -1410,60 +1415,73 @@ public: CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - } + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, &h, + &stride, &pass_stride, + &r, &pass_stride, + &frame_offset, + &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &pass_stride, &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - } + return !have_error(); + } + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); cuda_assert(cuCtxSynchronize()); return !have_error(); @@ -1533,6 +1551,7 @@ public: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { if(have_error()) @@ -1553,6 +1572,7 @@ public: &variance_offset, &mean_ptr, &variance_ptr, + &scale, &task->rect, &task->render_buffer.pass_stride, &task->render_buffer.offset}; @@ -1562,6 +1582,36 @@ public: return !have_error(); } + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1596,11 +1646,13 @@ public: void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 433cbd3c265..61e0ba47ab8 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -36,14 +36,28 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); } + render_buffer.frame_stride = task.frame_stride; render_buffer.pass_stride = task.pass_stride; render_buffer.offset = task.pass_denoising_data; - target_buffer.pass_stride = task.pass_stride; + target_buffer.pass_stride = task.target_pass_stride; target_buffer.denoising_clean_offset = task.pass_denoising_clean; + target_buffer.offset = 0; functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); + + tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); + tile_info->from_render = task.denoising_from_render? 1 : 0; + + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for(int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i-1]; + } + + write_passes = task.denoising_write_passes; + do_filter = task.denoising_do_filter; } DenoisingTask::~DenoisingTask() @@ -59,8 +73,6 @@ DenoisingTask::~DenoisingTask() void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - for(int i = 0; i < 9; i++) { tile_info->offsets[i] = rtiles[i].offset; tile_info->strides[i] = rtiles[i].stride; @@ -79,6 +91,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles) target_buffer.stride = rtiles[9].stride; target_buffer.ptr = rtiles[9].buffer; + if(write_passes && rtiles[9].buffers) { + target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + } + else { + target_buffer.denoising_output_offset = 0; + } + tile_info_mem.copy_to_device(); } @@ -89,15 +108,18 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.passes = 14; + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.passes = buffer.use_intensity? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; @@ -129,14 +151,14 @@ void DenoisingTask::prefilter_shadowing() functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); /* Reuse memory, the previous data isn't needed anymore. */ device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); @@ -147,7 +169,7 @@ void DenoisingTask::prefilter_shadowing() device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; /* Use the residual variance for a second filter pass. */ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); @@ -167,9 +189,9 @@ void DenoisingTask::prefilter_features() for(int pass = 0; pass < 7; pass++) { device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples); /* Smooth the pass and store the result in the denoising buffers. */ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); } } @@ -188,13 +210,52 @@ void DenoisingTask::prefilter_color() for(int pass = 0; pass < num_color_passes; pass++) { device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples); } device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + if(buffer.use_intensity) { + device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true); + functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); + } +} + +void DenoisingTask::load_buffer() +{ + device_ptr null_ptr = (device_ptr) 0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity? 15 : 14; + for(int i = 0; i < tile_info->num_frames; i++) { + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; +} + +void DenoisingTask::write_buffer() +{ + reconstruction_state.buffer_params = make_int4(target_buffer.offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + int num_passes = buffer.use_intensity? 15 : 14; + for(int pass = 0; pass < num_passes; pass++) { + device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride); + int out_offset = pass + target_buffer.denoising_output_offset; + functions.write_feature(out_offset, *from_pass, target_buffer.ptr); + } } void DenoisingTask::construct_transform() @@ -212,6 +273,8 @@ void DenoisingTask::reconstruct() { storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); + storage.XtWX.zero_to_device(); + storage.XtWY.zero_to_device(); reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; @@ -224,7 +287,18 @@ void DenoisingTask::reconstruct() device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr); + for(int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } + functions.solve(target_buffer.ptr); } void DenoisingTask::run_denoising(RenderTile *tile) @@ -236,12 +310,23 @@ void DenoisingTask::run_denoising(RenderTile *tile) setup_denoising_buffer(); - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); + if(tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } + + if(do_filter) { + construct_transform(); + reconstruct(); + } - construct_transform(); - reconstruct(); + if(write_passes) { + write_buffer(); + } functions.unmap_neighbor_tiles(rtiles); } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index beae60c220f..5869aa05390 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -38,6 +38,7 @@ public: struct RenderBuffers { int offset; int pass_stride; + int frame_stride; int samples; } render_buffer; @@ -47,6 +48,7 @@ public: int stride; int pass_stride; int denoising_clean_offset; + int denoising_output_offset; device_ptr ptr; } target_buffer; @@ -58,6 +60,9 @@ public: int4 rect; int4 filter_area; + bool write_passes; + bool do_filter; + struct DeviceFunctions { function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ @@ -66,8 +71,10 @@ public: )> non_local_means; function<bool(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr output_ptr - )> reconstruct; + device_ptr scale_ptr, + int frame + )> accumulate; + function<bool(device_ptr output_ptr)> solve; function<bool()> construct_transform; function<bool(device_ptr a_ptr, @@ -86,13 +93,18 @@ public: function<bool(int mean_offset, int variance_offset, device_ptr mean_ptr, - device_ptr variance_ptr + device_ptr variance_ptr, + float scale )> get_feature; function<bool(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr )> detect_outliers; + function<bool(int out_offset, + device_ptr frop_ptr, + device_ptr buffer_ptr + )> write_feature; function<void(RenderTile *rtiles)> map_neighbor_tiles; function<void(RenderTile *rtiles)> unmap_neighbor_tiles; } functions; @@ -114,8 +126,9 @@ public: int f; /* Patch size of the filter. */ float a; /* Variance compensation factor in the MSE estimation. */ float k_2; /* Squared value of the k parameter of the filter. */ + bool is_color; - void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; } } nlm_state; struct Storage { @@ -145,8 +158,11 @@ public: int stride; int h; int width; + int frame_stride; device_only_memory<float> mem; device_only_memory<float> temporary_mem; + bool use_time; + bool use_intensity; bool gpu_temporary_mem; @@ -166,6 +182,9 @@ protected: void prefilter_color(); void construct_transform(); void reconstruct(); + + void load_buffer(); + void write_buffer(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 861014373b3..2871bc5761a 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -72,7 +72,15 @@ public: float denoising_strength; float denoising_feature_strength; bool denoising_relative_pca; + bool denoising_from_render; + vector<int> denoising_frames; + + bool denoising_do_filter; + bool denoising_write_passes; + int pass_stride; + int frame_stride; + int target_pass_stride; int pass_denoising_data; int pass_denoising_clean; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index ea7ed4f1909..9b763167459 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -419,10 +419,13 @@ protected: device_ptr out_ptr, DenoisingTask *task); bool denoising_construct_transform(DenoisingTask *task); - bool denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, + DenoisingTask *task); bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, @@ -439,7 +442,12 @@ protected: int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index d4d7c0f74bc..4417065bb7f 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); @@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_mem guide_mem = CL_MEM_PTR(guide_ptr); cl_mem variance_mem = CL_MEM_PTR(variance_ptr); cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); @@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, kernel_set_args(ckNLMCalcDifference, 0, guide_mem, variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, - r, 0, a, k_2); + r, channel_offset, + 0, a, k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, blurDifference_mem, @@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, weightAccum_mem, w, h, stride, pass_stride, + channel_offset, r, f); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); @@ -816,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, transform_mem, rank_mem, task->filter_area, task->rect, task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, task->radius, task->pca_threshold); @@ -837,17 +857,15 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) return true; } -bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) { - mem_zero(task->storage.XtWX); - mem_zero(task->storage.XtWY); - cl_mem color_mem = CL_MEM_PTR(color_ptr); cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); @@ -859,11 +877,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); int w = task->reconstruction_state.source_w; int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; int r = task->radius; int pass_stride = task->buffer.pass_stride; @@ -877,11 +897,13 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, + scale_mem, difference_mem, w, h, stride, pass_stride, r, pass_stride, + frame_offset, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, difference_mem, @@ -896,6 +918,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, pass_stride, r, 4); kernel_set_args(ckNLMConstructGramian, 0, + t, blurDifference_mem, buffer_mem, transform_mem, @@ -905,7 +928,9 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, task->reconstruction_state.filter_window, w, h, stride, pass_stride, - r, 4); + r, 4, + frame_offset, + use_time); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -913,6 +938,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + return true; +} + +bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr, + DenoisingTask *task) +{ + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + kernel_set_args(ckFinalize, 0, output_mem, rank_mem, @@ -1000,6 +1041,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, + float scale, DenoisingTask *task) { cl_mem mean_mem = CL_MEM_PTR(mean_ptr); @@ -1023,6 +1065,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, variance_offset, mean_mem, variance_mem, + scale, task->rect, task->render_buffer.pass_stride, task->render_buffer.offset); @@ -1033,6 +1076,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, return true; } +bool OpenCLDeviceBase::denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) +{ + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + return true; +} + bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, @@ -1063,11 +1131,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising); denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); |