diff options
author | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-08 23:13:40 +0300 |
---|---|---|
committer | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-08 23:13:40 +0300 |
commit | 15e9d80375797dd7ba9779daf6d1a7da5cd6de8e (patch) | |
tree | 0e065f6e43c404f75ba141b7bf1c868c9ba84efd | |
parent | 9756475ed632d868b16352f389fc276a6879b867 (diff) |
Cycles: Use existing shared temporary memory in reconstruction step of the denoiser
Previously the code allocated its own temporary memory, but it's possible to just use the existing shared one instead.
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 23 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 48 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 23 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/filter.cl | 23 |
4 files changed, 52 insertions, 65 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 7d9a13ecc88..5b46d5a507d 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1397,18 +1397,14 @@ public: int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; + int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int mem_size = sizeof(float)*shift_stride*num_shifts; - - device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); - temporary_mem.alloc_to_device(2*mem_size); if(have_error()) return false; - CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + mem_size; + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; { CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; @@ -1426,9 +1422,9 @@ public: task->reconstruction_state.source_w * task->reconstruction_state.source_h, num_shifts); - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, @@ -1437,9 +1433,8 @@ public: &task->storage.XtWY.device_pointer, &task->reconstruction_state.filter_window, &w, &h, &stride, - &shift_stride, &r, - &f, - &task->buffer.pass_stride}; + &pass_stride, &r, + &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1448,8 +1443,6 @@ public: CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); } - temporary_mem.free(); - { CUfunction cuFinalize; cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index cc887134bb0..75418dad1cc 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; - int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); - int mem_size = sizeof(float)*shift_stride*num_shifts; + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r-+1)*(2*r+1); - cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); - cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, - difference, + difference_mem, w, h, stride, - shift_stride, - task->radius, - task->buffer.pass_stride, + pass_stride, + r, + pass_stride, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, + difference_mem, + blurDifference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, + blurDifference_mem, + difference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMConstructGramian, 0, - blurDifference, + blurDifference_mem, buffer_mem, transform_mem, rank_mem, @@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, XtWY_mem, task->reconstruction_state.filter_window, w, h, stride, - shift_stride, - task->radius, 4, - task->buffer.pass_stride); + pass_stride, + r, 4); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - opencl_assert(clReleaseMemObject(difference)); - opencl_assert(clReleaseMemObject(blurDifference)); - kernel_set_args(ckFinalize, 0, output_mem, rank_mem, diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 0561c40e6b1..b856cbde45c 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 3c75754fb39..a550f97f4eb 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, |