diff options
author | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-08 23:13:40 +0300 |
---|---|---|
committer | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-08 23:13:40 +0300 |
commit | 15e9d80375797dd7ba9779daf6d1a7da5cd6de8e (patch) | |
tree | 0e065f6e43c404f75ba141b7bf1c868c9ba84efd /intern | |
parent | 9756475ed632d868b16352f389fc276a6879b867 (diff) |
Cycles: Use existing shared temporary memory in reconstruction step of the denoiser
Previously the code allocated its own temporary memory, but it's possible to just use the existing shared one instead.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 23 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 48 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 23 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/filter.cl | 23 |
4 files changed, 52 insertions, 65 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 7d9a13ecc88..5b46d5a507d 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1397,18 +1397,14 @@ public: int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; + int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int mem_size = sizeof(float)*shift_stride*num_shifts; - - device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); - temporary_mem.alloc_to_device(2*mem_size); if(have_error()) return false; - CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + mem_size; + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; { CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; @@ -1426,9 +1422,9 @@ public: task->reconstruction_state.source_w * task->reconstruction_state.source_h, num_shifts); - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, @@ -1437,9 +1433,8 @@ public: &task->storage.XtWY.device_pointer, &task->reconstruction_state.filter_window, &w, &h, &stride, - &shift_stride, &r, - &f, - &task->buffer.pass_stride}; + &pass_stride, &r, + &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1448,8 +1443,6 @@ public: CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); } - temporary_mem.free(); - { CUfunction cuFinalize; cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index cc887134bb0..75418dad1cc 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; - int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); - int mem_size = sizeof(float)*shift_stride*num_shifts; + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r-+1)*(2*r+1); - cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); - cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, - difference, + difference_mem, w, h, stride, - shift_stride, - task->radius, - task->buffer.pass_stride, + pass_stride, + r, + pass_stride, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, + difference_mem, + blurDifference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, + blurDifference_mem, + difference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMConstructGramian, 0, - blurDifference, + blurDifference_mem, buffer_mem, transform_mem, rank_mem, @@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, XtWY_mem, task->reconstruction_state.filter_window, w, h, stride, - shift_stride, - task->radius, 4, - task->buffer.pass_stride); + pass_stride, + r, 4); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - opencl_assert(clReleaseMemObject(difference)); - opencl_assert(clReleaseMemObject(blurDifference)); - kernel_set_args(ckFinalize, 0, output_mem, rank_mem, diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 0561c40e6b1..b856cbde45c 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 3c75754fb39..a550f97f4eb 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, |