diff options
Diffstat (limited to 'intern/cycles/device/opencl/opencl_base.cpp')
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 225 |
1 files changed, 117 insertions, 108 deletions
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index f43177247ef..fe084edc90e 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size } /* Try to divide evenly over 2 dimensions. */ - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; + size_t local_size[2]; + if(x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } /* Some implementations have max size 1 on 2nd dimension. */ if(local_size[1] > max_work_items[1]) { @@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int4 rect = task->rect; - int w = rect.z-rect.x; - int h = rect.w-rect.y; + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; int r = task->nlm_state.r; int f = task->nlm_state.f; float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); - cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); cl_mem image_mem = CL_MEM_PTR(image_ptr); cl_mem guide_mem = CL_MEM_PTR(guide_ptr); @@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, guide_mem, variance_mem, - difference, local_rect, w, 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference, blurDifference, local_rect, w, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, difference, local_rect, w, f); - kernel_set_args(ckNLMUpdateOutput, 0, - dx, dy, blurDifference, image_mem, - out_mem, weightAccum, local_rect, w, f); - - enqueue_kernel(ckNLMCalcDifference, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMCalcWeight, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMUpdateOutput, w, h); - } + kernel_set_args(ckNLMCalcDifference, 0, + guide_mem, + variance_mem, + difference, + w, h, stride, + shift_stride, + r, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMUpdateOutput, 0, + blurDifference, + image_mem, + out_mem, + weightAccum, + w, h, stride, + shift_stride, + r, f); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); - int4 local_rect = make_int4(0, 0, w, h); kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum, local_rect, w); + out_mem, weightAccum, w, h, stride); enqueue_kernel(ckNLMNormalize, w, h); return true; @@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); - - int r = task->radius; - int f = 4; - float a = 1.0f; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, - color_mem, - color_variance_mem, - difference, - local_rect, - task->buffer.w, - task->buffer.pass_stride, - a, task->nlm_k_2); - enqueue_kernel(ckNLMCalcDifference, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMCalcWeight, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - /* Reuse previous arguments. */ - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMConstructGramian, 0, - dx, dy, - blurDifference, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - local_rect, - task->reconstruction_state.filter_rect, - task->buffer.w, - task->buffer.h, - f, - task->buffer.pass_stride); - enqueue_kernel(ckNLMConstructGramian, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h, - 256); - } + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + + int shift_stride = stride*h; + int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + + kernel_set_args(ckNLMCalcDifference, 0, + color_mem, + color_variance_mem, + difference, + w, h, stride, + shift_stride, + task->radius, + task->buffer.pass_stride, + 1.0f, task->nlm_k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMConstructGramian, 0, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, h, stride, + shift_stride, + task->radius, 4, + task->buffer.pass_stride); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); kernel_set_args(ckFinalize, 0, - task->buffer.w, - task->buffer.h, output_mem, rank_mem, XtWX_mem, @@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, task->filter_area, task->reconstruction_state.buffer_params, task->render_buffer.samples); - enqueue_kernel(ckFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); + enqueue_kernel(ckFinalize, w, h); return true; } |