diff options
Diffstat (limited to 'intern/cycles/device/opencl/opencl_base.cpp')
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 516 |
1 files changed, 489 insertions, 27 deletions
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 52d0662a8e3..509da7a0a84 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -20,6 +20,7 @@ #include "kernel/kernel_types.h" +#include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" @@ -213,8 +214,24 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. */ if(!load_kernels(requested_features, programs)) { return false; @@ -260,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; @@ -322,37 +358,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +437,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +525,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +534,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +623,380 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + color_mem, + color_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. */ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ @@ -612,7 +1066,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) string OpenCLDeviceBase::kernel_build_options(const string *debug_src) { - string build_options = "-cl-fast-relaxed-math "; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; if(platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " @@ -792,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is not getting feature-based optimizations. @@ -800,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } |