diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 21 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 244 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 24 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 5 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 225 |
6 files changed, 271 insertions, 252 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 999b9230d29..2d28ccd2b49 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -190,9 +190,9 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -565,13 +565,13 @@ public: (float*) color_variance_ptr, difference, local_rect, - task->buffer.w, + task->buffer.stride, task->buffer.pass_stride, 1.0f, task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, (float*) task->buffer.mem.device_pointer, @@ -580,9 +580,8 @@ public: (float*) task->storage.XtWX.device_pointer, (float3*) task->storage.XtWY.device_pointer, local_rect, - &task->reconstruction_state.filter_rect.x, - task->buffer.w, - task->buffer.h, + &task->reconstruction_state.filter_window.x, + task->buffer.stride, 4, task->buffer.pass_stride); } @@ -591,8 +590,6 @@ public: filter_finalize_kernel()(x, y, y*task->filter_area.z + x, - task->buffer.w, - task->buffer.h, (float*) output_ptr, (int*) task->storage.rank.device_pointer, (float*) task->storage.XtWX.device_pointer, diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index d8d787ba706..a663da748df 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1087,6 +1087,19 @@ public: threads, threads, 1, \ 0, 0, args, 0)); +/* Similar as above, but for 1-dimensional blocks. */ +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads_per_block, 1, 1, \ + 0, 0, args, 0)); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task) { @@ -1095,60 +1108,65 @@ public: CUDAContextScope scope(this); - int4 rect = task->rect; - int w = align_up(rect.z-rect.x, 4); - int h = rect.w-rect.y; + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; int r = task->nlm_state.r; int f = task->nlm_state.f; float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - CUdeviceptr difference = task->nlm_state.temporary_1_ptr; - CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; - CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*2*num_shifts; + int channel_offset = 0; - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + CUdeviceptr temporary_mem; + cuda_assert(cuMemAlloc(&temporary_mem, mem_size)); + CUdeviceptr difference = temporary_mem; + CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts; - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - int dx, dy; - int4 local_rect; - int channel_offset = 0; - void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; - void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; - - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - dy = i / (2*r+1) - r; - dx = i % (2*r+1) - r; - local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - - CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); - } - - local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); - void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + cuMemFree(temporary_mem); + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } return !have_error(); } @@ -1194,91 +1212,81 @@ public: mem_zero(task->storage.XtWX); mem_zero(task->storage.XtWY); - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; - CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; - CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; - CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + CUdeviceptr temporary_mem; + cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size)); + CUdeviceptr difference = temporary_mem; + CUdeviceptr blurDifference = temporary_mem + mem_size; - int r = task->radius; - int f = 4; - float a = 1.0f; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - - void *calc_difference_args[] = {&dx, &dy, - &color_ptr, - &color_variance_ptr, - &difference, - &local_rect, - &task->buffer.w, - &task->buffer.pass_stride, - &a, - &task->nlm_k_2}; - CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); - - void *blur_args[] = {&difference, - &blurDifference, - &local_rect, - &task->buffer.w, - &f}; - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - - void *calc_weight_args[] = {&blurDifference, - &difference, - &local_rect, - &task->buffer.w, - &f}; - CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); - - /* Reuse previous arguments. */ - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - - void *construct_gramian_args[] = {&dx, &dy, - &blurDifference, + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->storage.XtWX.device_pointer, &task->storage.XtWY.device_pointer, - &local_rect, - &task->reconstruction_state.filter_rect, - &task->buffer.w, - &task->buffer.h, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &shift_stride, &r, &f, &task->buffer.pass_stride}; - CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); - } - - void *finalize_args[] = {&task->buffer.w, - &task->buffer.h, - &output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + } + + cuMemFree(temporary_mem); + + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + } + cuda_assert(cuCtxSynchronize()); return !have_error(); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 69c43e4a8cf..1862deb9a61 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task) render_buffer.denoising_clean_offset = task.pass_denoising_clean; /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ - rect = make_int4(max(tiles->x[0], filter_area.x - radius), - max(tiles->y[0], filter_area.y - radius), - min(tiles->x[3], filter_area.x + filter_area.z + radius), - min(tiles->y[3], filter_area.y + filter_area.w + radius)); + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3])); } void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) @@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising() { /* Allocate denoising buffer. */ buffer.passes = 14; - buffer.w = align_up(rect.z - rect.x, 4); + buffer.width = rect.z - rect.x; + buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; - buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false); device_ptr null_ptr = (device_ptr) 0; @@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising() functions.construct_transform(); - storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false); - storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false); - reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer; - reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer; + device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1"); + device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2"); + temporary_1.alloc_to_device(buffer.pass_stride, false); + temporary_2.alloc_to_device(buffer.pass_stride, false); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = temporary_2.device_pointer; storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); - reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, render_buffer.stride, diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index ec4e7933cdc..77a82d0ad04 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -94,7 +94,7 @@ public: device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ device_ptr temporary_2_ptr; - int4 filter_rect; + int4 filter_window; int4 buffer_params; int source_w; @@ -148,8 +148,9 @@ public: struct DenoiseBuffers { int pass_stride; int passes; - int w; + int stride; int h; + int width; device_only_memory<float> mem; DenoiseBuffers(Device *device) diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index c02f8ffafe6..f38c2f65c1e 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -353,7 +353,9 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index f43177247ef..fe084edc90e 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size } /* Try to divide evenly over 2 dimensions. */ - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; + size_t local_size[2]; + if(x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } /* Some implementations have max size 1 on 2nd dimension. */ if(local_size[1] > max_work_items[1]) { @@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int4 rect = task->rect; - int w = rect.z-rect.x; - int h = rect.w-rect.y; + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; int r = task->nlm_state.r; int f = task->nlm_state.f; float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); - cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); cl_mem image_mem = CL_MEM_PTR(image_ptr); cl_mem guide_mem = CL_MEM_PTR(guide_ptr); @@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, guide_mem, variance_mem, - difference, local_rect, w, 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference, blurDifference, local_rect, w, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, difference, local_rect, w, f); - kernel_set_args(ckNLMUpdateOutput, 0, - dx, dy, blurDifference, image_mem, - out_mem, weightAccum, local_rect, w, f); - - enqueue_kernel(ckNLMCalcDifference, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMCalcWeight, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMUpdateOutput, w, h); - } + kernel_set_args(ckNLMCalcDifference, 0, + guide_mem, + variance_mem, + difference, + w, h, stride, + shift_stride, + r, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMUpdateOutput, 0, + blurDifference, + image_mem, + out_mem, + weightAccum, + w, h, stride, + shift_stride, + r, f); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); - int4 local_rect = make_int4(0, 0, w, h); kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum, local_rect, w); + out_mem, weightAccum, w, h, stride); enqueue_kernel(ckNLMNormalize, w, h); return true; @@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); - - int r = task->radius; - int f = 4; - float a = 1.0f; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, - color_mem, - color_variance_mem, - difference, - local_rect, - task->buffer.w, - task->buffer.pass_stride, - a, task->nlm_k_2); - enqueue_kernel(ckNLMCalcDifference, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMCalcWeight, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - /* Reuse previous arguments. */ - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMConstructGramian, 0, - dx, dy, - blurDifference, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - local_rect, - task->reconstruction_state.filter_rect, - task->buffer.w, - task->buffer.h, - f, - task->buffer.pass_stride); - enqueue_kernel(ckNLMConstructGramian, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h, - 256); - } + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + + int shift_stride = stride*h; + int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + + kernel_set_args(ckNLMCalcDifference, 0, + color_mem, + color_variance_mem, + difference, + w, h, stride, + shift_stride, + task->radius, + task->buffer.pass_stride, + 1.0f, task->nlm_k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMConstructGramian, 0, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, h, stride, + shift_stride, + task->radius, 4, + task->buffer.pass_stride); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); kernel_set_args(ckFinalize, 0, - task->buffer.w, - task->buffer.h, output_mem, rank_mem, XtWX_mem, @@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, task->filter_area, task->reconstruction_state.buffer_params, task->render_buffer.samples); - enqueue_kernel(ckFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); + enqueue_kernel(ckFinalize, w, h); return true; } |