diff options
Diffstat (limited to 'intern/cycles/device/opencl')
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 83 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 516 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_mega.cpp | 54 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_split.cpp | 123 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_util.cpp | 81 |
5 files changed, 765 insertions, 92 deletions
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 764216d0dfa..78ca377d933 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -17,6 +17,7 @@ #ifdef WITH_OPENCL #include "device/device.h" +#include "device/device_denoising.h" #include "util/util_map.h" #include "util/util_param.h" @@ -26,24 +27,24 @@ CCL_NAMESPACE_BEGIN +/* Disable workarounds, seems to be working fine on latest drivers. */ +#define CYCLES_DISABLE_DRIVER_WORKAROUNDS + /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ # undef clEnqueueNDRangeKernel # define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueWriteBuffer # define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueReadBuffer # define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); #endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ @@ -86,7 +87,7 @@ public: string *error = NULL); static bool device_version_check(cl_device_id device, string *error = NULL); - static string get_hardware_id(string platform_name, + static string get_hardware_id(const string& platform_name, cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); @@ -132,6 +133,13 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error = NULL); + + static int mem_address_alignment(cl_device_id device_id); + /* Get somewhat more readable device name. * Main difference is AMD OpenCL here which only gives code name * for the regular device name. This will give more sane device @@ -221,7 +229,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -242,17 +250,17 @@ public: public: OpenCLProgram() : loaded(false), device(NULL) {} OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_name, - string kernel_build_options, + const string& program_name, + const string& kernel_name, + const string& kernel_build_options, bool use_stdout = true); ~OpenCLProgram(); void add_kernel(ustring name); void load(); - bool is_loaded() { return loaded; } - string get_log() { return log; } + bool is_loaded() const { return loaded; } + const string& get_log() const { return log; } void report_error(); cl_kernel operator()(); @@ -266,8 +274,8 @@ public: bool load_binary(const string& clbin, const string *debug_src = NULL); bool save_binary(const string& clbin); - void add_log(string msg, bool is_debug); - void add_error(string msg); + void add_log(const string& msg, bool is_debug); + void add_error(const string& msg); bool loaded; cl_program program; @@ -285,7 +293,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -323,6 +331,9 @@ public: void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_address_alignment(); + void const_copy_to(const char *name, void *host, size_t size); void tex_alloc(const char *name, device_memory& mem, @@ -331,12 +342,14 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -370,9 +383,51 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type); + void mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: ArgumentWrapper() : size(0), pointer(NULL) diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 52d0662a8e3..509da7a0a84 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -20,6 +20,7 @@ #include "kernel/kernel_types.h" +#include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" @@ -213,8 +214,24 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. */ if(!load_kernels(requested_features, programs)) { return false; @@ -260,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; @@ -322,37 +358,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +437,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +525,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +534,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +623,380 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + color_mem, + color_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. */ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ @@ -612,7 +1066,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) string OpenCLDeviceBase::kernel_build_options(const string *debug_src) { - string build_options = "-cl-fast-relaxed-math "; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; if(platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " @@ -792,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is not getting feature-based optimizations. @@ -800,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index a2fd1d71156..06c15bcf401 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -108,41 +108,53 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index b8df57ec7b9..76d9983e9a2 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -25,6 +25,7 @@ #include "device/device_split_kernel.h" +#include "util/util_algorithm.h" #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_path.h" @@ -70,6 +71,10 @@ public: delete split_kernel; } + virtual bool show_samples() const { + return true; + } + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { @@ -100,7 +105,7 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to @@ -123,21 +128,29 @@ public: /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); + task->update_progress(&tile, tile.w*tile.h); + } task->release_tile(tile); } @@ -146,6 +159,11 @@ public: } } + bool is_split_kernel() + { + return true; + } + protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ @@ -159,17 +177,62 @@ protected: friend class OpenCLSplitKernelFunction; }; +struct CachedSplitMemory { + int id; + device_memory *split_data; + device_memory *ray_state; + device_ptr *rng_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; +}; + class OpenCLSplitKernelFunction : public SplitKernelFunction { public: OpenCLDeviceSplitKernel* device; OpenCLDeviceBase::OpenCLProgram program; + CachedSplitMemory& cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) + { + } - OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} - ~OpenCLSplitKernelFunction() { program.release(); } + ~OpenCLSplitKernelFunction() + { + program.release(); + } virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - device->kernel_set_args(program(), 0, kg, data); + if(cached_id != cached_memory.id) { + cl_uint start_arg_index = + device->kernel_set_args(program(), + 0, + kg, + data, + *cached_memory.split_data, + *cached_memory.ray_state, + *cached_memory.rng_state); + +/* TODO(sergey): Avoid map lookup here. */ +#define KERNEL_TEX(type, ttype, name) \ + device->set_kernel_arg_mem(program(), &start_arg_index, #name); +#include "kernel/kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, program(), @@ -196,14 +259,15 @@ public: class OpenCLSplitKernel : public DeviceSplitKernel { OpenCLDeviceSplitKernel *device; + CachedSplitMemory cached_memory; public: explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, const DeviceRequestedFeatures& requested_features) { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); bool single_program = OpenCLInfo::use_single_program(); kernel->program = @@ -332,6 +396,15 @@ public: return false; } + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.rng_state = &rtile.rng_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + return true; } @@ -351,12 +424,18 @@ public: cl_ulong max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); - int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index fe1c65a2224..0d34af3e040 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5() } OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_file, - string kernel_build_options, + const string& program_name, + const string& kernel_file, + const string& kernel_build_options, bool use_stdout) : device(device), program_name(program_name), @@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) +void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; @@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) } } -void OpenCLDeviceBase::OpenCLProgram::add_error(string msg) +void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(!get_device_name(device_id, &device_name)) { return false; } + + int driver_major = 0; + int driver_minor = 0; + if(!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework * (aka, it will not be on Intel framework). This isn't supported * and needs an explicit blacklist. @@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { + if(driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = { + /* GCN 1 */ + "Tahiti", "Pitcairn", "Capeverde", "Oland", + NULL + }; + for (int i = 0; blacklist[i] != NULL; i++) { + if(device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } return true; } if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { @@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device, return true; } -string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) { if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { /* Use cl_amd_device_topology extension. */ @@ -902,7 +925,7 @@ bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string OpenCLInfo::get_platform_name(cl_platform_id platform_id) { string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { + if(!get_platform_name(platform_id, &platform_name)) { return ""; } return platform_name; @@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, - &length) == CL_SUCCESS) + &length) == CL_SUCCESS) { if(length != 0 && board_name[0] != '\0') { return board_name; @@ -1073,6 +1096,48 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +bool OpenCLInfo::get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DRIVER_VERSION, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + if(sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; +} + +int OpenCLInfo::mem_address_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif |