diff options
author | Lukas Stockner <lukas.stockner@freenet.de> | 2018-08-25 22:19:44 +0300 |
---|---|---|
committer | Lukas Stockner <lukas.stockner@freenet.de> | 2018-08-25 22:23:52 +0300 |
commit | 94efc651d40fe62417b605e2f400fd364fb0d8ef (patch) | |
tree | 27e02bd43309f6f4a213ee94fcbfb75828cf1d50 /intern/cycles/device/device_cuda.cpp | |
parent | 60a5ba265cbda5293a21eaeab1d65ba155d66e03 (diff) |
Cycles Denoiser: Allocate a single temporary buffer for the entire denoising process
With small tiles, the repeated allocations on GPUs can actually slow down the denoising quite a lot.
Allocating the buffer just once reduces rendertime for the default cube with 16x16 tiles and denoising on a mobile 1050 from 22.7sec to 14.0sec.
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 27 |
1 files changed, 11 insertions, 16 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 73c9697223d..da8e49f129f 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1294,23 +1294,19 @@ public: float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - int shift_stride = stride*h; + int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int mem_size = sizeof(float)*shift_stride*num_shifts; int channel_offset = 0; - device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); - temporary_mem.alloc_to_device(2*mem_size); - if(have_error()) return false; - CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + mem_size; + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; + CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; - CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride)); + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); { CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; @@ -1326,10 +1322,10 @@ public: CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1338,8 +1334,6 @@ public: CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); } - temporary_mem.free(); - { CUfunction cuNLMNormalize; cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); @@ -1614,6 +1608,7 @@ public: denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; denoising.run_denoising(&rtile); } |