diff options
Diffstat (limited to 'intern/cycles/device/opencl')
-rw-r--r-- | intern/cycles/device/opencl/memory_manager.cpp | 253 | ||||
-rw-r--r-- | intern/cycles/device/opencl/memory_manager.h | 105 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 77 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 192 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_mega.cpp | 5 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_split.cpp | 89 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_util.cpp | 65 |
7 files changed, 723 insertions, 63 deletions
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp new file mode 100644 index 00000000000..b67dfef88aa --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -0,0 +1,253 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPENCL + +#include "util/util_foreach.h" + +#include "device/opencl/opencl.h" +#include "device/opencl/memory_manager.h" + +CCL_NAMESPACE_BEGIN + +void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +{ + allocations.push_back(&allocation); +} + +void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device) +{ + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for(int i = allocations.size()-1; i >= 0; i--) { + Allocation* allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if(!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin()+i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. */ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if(allocation->size != alloc_size) { + /* Allocation is either new or resized. */ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if(need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_memory *new_buffer = new device_memory; + + new_buffer->resize(total_size); + device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + (void*)allocation->mem->data_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, NULL, NULL + )); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + device->mem_free(*buffer); + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + (void*)allocation->mem->data_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); +} + +void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device) +{ + device->mem_free(*buffer); +} + +MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +{ + DeviceBuffer* smallest = device_buffers; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } + + return smallest; +} + +MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false) +{ +} + +void MemoryManager::free() +{ + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.free(device); + } +} + +void MemoryManager::alloc(const char *name, device_memory& mem) +{ + Allocation& allocation = allocations[name]; + + allocation.mem = &mem; + allocation.needs_copy_to_device = true; + + if(!allocation.device_buffer) { + DeviceBuffer* device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; + + allocation.desc.device_buffer = device_buffer - device_buffers; + + device_buffer->add_allocation(allocation); + + device_buffer->size += mem.memory_size(); + } + + need_update = true; +} + +bool MemoryManager::free(device_memory& mem) +{ + foreach(AllocationsMap::value_type& value, allocations) { + Allocation& allocation = value.second; + if(allocation.mem == &mem) { + + allocation.device_buffer->size -= mem.memory_size(); + + allocation.mem = NULL; + allocation.needs_copy_to_device = false; + + need_update = true; + return true; + } + } + + return false; +} + +MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) +{ + update_device_memory(); + + Allocation& allocation = allocations[name]; + return allocation.desc; +} + +void MemoryManager::update_device_memory() +{ + if(!need_update) { + return; + } + + need_update = false; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } +} + +void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + update_device_memory(); + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPENCL */ + diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h new file mode 100644 index 00000000000..3714405d026 --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device.h" + +#include "util/util_map.h" +#include "util/util_vector.h" +#include "util/util_string.h" + +#include "clew.h" + +CCL_NAMESPACE_BEGIN + +class OpenCLDeviceBase; + +class MemoryManager { +public: + static const int NUM_DEVICE_BUFFERS = 8; + + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; + +private: + struct DeviceBuffer; + + struct Allocation { + device_memory *mem; + + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. */ + + BufferDescriptor desc; + + bool needs_copy_to_device; + + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; + + struct DeviceBuffer { + device_memory *buffer; + vector<Allocation*> allocations; + size_t size; /* Size of all allocations. */ + + DeviceBuffer() : buffer(new device_memory), size(0) + { + } + + ~DeviceBuffer() { + delete buffer; + buffer = NULL; + } + + void add_allocation(Allocation& allocation); + + void update_device_memory(OpenCLDeviceBase *device); + + void free(OpenCLDeviceBase *device); + }; + + OpenCLDeviceBase *device; + + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; + + bool need_update; + + DeviceBuffer* smallest_device_buffer(); + +public: + MemoryManager(OpenCLDeviceBase *device); + + void free(); /* Free all memory. */ + + void alloc(const char *name, device_memory& mem); + bool free(device_memory& mem); + + BufferDescriptor get_descriptor(string name); + + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); +}; + +CCL_NAMESPACE_END + diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 27e196d1e68..26bf4a9af5b 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -25,8 +25,13 @@ #include "clew.h" +#include "device/opencl/memory_manager.h" + CCL_NAMESPACE_BEGIN +/* Disable workarounds, seems to be working fine on latest drivers. */ +#define CYCLES_DISABLE_DRIVER_WORKAROUNDS + /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ @@ -84,7 +89,7 @@ public: string *error = NULL); static bool device_version_check(cl_device_id device, string *error = NULL); - static string get_hardware_id(string platform_name, + static string get_hardware_id(const string& platform_name, cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); @@ -130,6 +135,11 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error = NULL); + static int mem_address_alignment(cl_device_id device_id); /* Get somewhat more readable device name. @@ -216,6 +226,18 @@ public: static string get_kernel_md5(); }; +#define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ + \ + if(err != CL_SUCCESS) { \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } (void)0 + #define opencl_assert(stmt) \ { \ cl_int err = stmt; \ @@ -242,17 +264,17 @@ public: public: OpenCLProgram() : loaded(false), device(NULL) {} OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_name, - string kernel_build_options, + const string& program_name, + const string& kernel_name, + const string& kernel_build_options, bool use_stdout = true); ~OpenCLProgram(); void add_kernel(ustring name); void load(); - bool is_loaded() { return loaded; } - string get_log() { return log; } + bool is_loaded() const { return loaded; } + const string& get_log() const { return log; } void report_error(); cl_kernel operator()(); @@ -266,8 +288,8 @@ public: bool load_binary(const string& clbin, const string *debug_src = NULL); bool save_binary(const string& clbin); - void add_log(string msg, bool is_debug); - void add_error(string msg); + void add_log(const string& msg, bool is_debug); + void add_error(const string& msg); bool loaded; cl_program program; @@ -336,6 +358,7 @@ public: size_t global_size_round_up(int group_size, int global_size); void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); @@ -390,8 +413,6 @@ protected: bool denoising_construct_transform(DenoisingTask *task); bool denoising_reconstruct(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr guide_ptr, - device_ptr guide_variance_ptr, device_ptr output_ptr, DenoisingTask *task); bool denoising_combine_halves(device_ptr a_ptr, @@ -519,6 +540,42 @@ protected: virtual string build_options_for_base_program( const DeviceRequestedFeatures& /*requested_features*/); + +private: + MemoryManager memory_manager; + friend class MemoryManager; + + struct tex_info_t { + uint buffer, padding; + cl_ulong offset; + uint width, height, depth, options; + }; + static_assert_align(tex_info_t, 16); + + vector<tex_info_t> texture_descriptors; + device_memory texture_descriptors_buffer; + + struct Texture { + Texture() {} + Texture(device_memory* mem, + InterpolationType interpolation, + ExtensionType extension) + : mem(mem), + interpolation(interpolation), + extension(extension) { + } + device_memory* mem; + InterpolationType interpolation; + ExtensionType extension; + }; + + typedef map<string, Texture> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + +protected: + void flush_texture_buffers(); }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 24b70e3446c..7bdf81462b8 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -20,6 +20,7 @@ #include "kernel/kernel_types.h" +#include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" @@ -28,6 +29,15 @@ CCL_NAMESPACE_BEGIN +struct texture_slot_t { + texture_slot_t(const string& name, int slot) + : name(name), + slot(slot) { + } + string name; + int slot; +}; + bool OpenCLDeviceBase::opencl_error(cl_int err) { if(err != CL_SUCCESS) { @@ -62,7 +72,7 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where) } OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_) -: Device(info, stats, background_) +: Device(info, stats, background_), memory_manager(this) { cpPlatform = NULL; cdDevice = NULL; @@ -70,6 +80,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cqCommandQueue = NULL; null_mem = 0; device_initialized = false; + textures_need_update = true; vector<OpenCLPlatformDevice> usable_devices; OpenCLInfo::get_usable_devices(&usable_devices); @@ -125,6 +136,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou return; } + /* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */ + texture_descriptors.resize(1); + texture_descriptors_buffer.resize(1); + texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + fprintf(stderr, "Device init success\n"); device_initialized = true; } @@ -133,6 +150,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase() { task_pool.stop(); + memory_manager.free(); + if(null_mem) clReleaseMemObject(CL_MEM_PTR(null_mem)); @@ -276,6 +295,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; @@ -473,29 +511,35 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) void OpenCLDeviceBase::tex_alloc(const char *name, device_memory& mem, - InterpolationType /*interpolation*/, - ExtensionType /*extension*/) + InterpolationType interpolation, + ExtensionType extension) { VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(NULL, mem, MEM_READ_ONLY); - mem_copy_to(mem); - assert(mem_map.find(name) == mem_map.end()); - mem_map.insert(MemMap::value_type(name, mem.device_pointer)); + + memory_manager.alloc(name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[name] = Texture(&mem, interpolation, extension); + textures_need_update = true; } void OpenCLDeviceBase::tex_free(device_memory& mem) { if(mem.device_pointer) { - foreach(const MemMap::value_type& value, mem_map) { - if(value.second == mem.device_pointer) { - mem_map.erase(value.first); + mem.device_pointer = 0; + + if(memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach(TexturesMap::value_type& value, textures) { + if(value.second.mem == &mem) { + textures.erase(value.first); break; } } - - mem_free(mem); } } @@ -561,6 +605,98 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); } +void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + flush_texture_buffers(); + + memory_manager.set_kernel_arg_buffers(kernel, narg); +} + +void OpenCLDeviceBase::flush_texture_buffers() +{ + if(!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. */ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +#define KERNEL_TEX(type, ttype, name) \ + if(textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +#include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach(TexturesMap::value_type& tex, textures) { + string name = tex.first; + + if(string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, + num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_descriptors_buffer); + + texture_descriptors.resize(num_slots); + texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t)); + texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; + + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + + /* Fill in descriptors */ + foreach(texture_slot_t& slot, texture_slots) { + Texture& tex = textures[slot.name]; + + tex_info_t& info = texture_descriptors[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + + info.offset = desc.offset; + info.buffer = desc.device_buffer; + + if(string_startswith(slot.name, "__tex_image")) { + info.width = tex.mem->data_width; + info.height = tex.mem->data_height; + info.depth = tex.mem->data_depth; + + info.options = 0; + + if(tex.interpolation == INTERPOLATION_CLOSEST) { + info.options |= (1 << 0); + } + + switch(tex.extension) { + case EXTENSION_REPEAT: + info.options |= (1 << 1); + break; + case EXTENSION_EXTEND: + info.options |= (1 << 2); + break; + case EXTENSION_CLIP: + info.options |= (1 << 3); + break; + default: + break; + } + } + } + + /* Force write of descriptors. */ + memory_manager.free(texture_descriptors_buffer); + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); +} + void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { /* cast arguments to cl types */ @@ -585,10 +721,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ d_rgba, d_buffer); -#define KERNEL_TEX(type, ttype, name) \ -set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); start_arg_index += kernel_set_args(ckFilmConvertKernel, start_arg_index, @@ -693,8 +826,6 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, device_ptr color_variance_ptr, - device_ptr guide_ptr, - device_ptr guide_variance_ptr, device_ptr output_ptr, DenoisingTask *task) { @@ -703,8 +834,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_mem color_mem = CL_MEM_PTR(color_ptr); cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr); cl_mem output_mem = CL_MEM_PTR(output_ptr); cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); @@ -735,8 +864,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, kernel_set_args(ckNLMCalcDifference, 0, dx, dy, - guide_mem, - guide_variance_mem, + color_mem, + color_variance_mem, difference, local_rect, task->buffer.w, @@ -775,8 +904,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, dx, dy, blurDifference, buffer_mem, - color_mem, - color_variance_mem, transform_mem, rank_mem, XtWX_mem, @@ -961,7 +1088,7 @@ void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); @@ -1016,10 +1143,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) d_output_luma); } -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(kernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(kernel, &start_arg_index); start_arg_index += kernel_set_args(kernel, start_arg_index, @@ -1232,7 +1356,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is not getting feature-based optimizations. @@ -1240,6 +1364,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 06c15bcf401..ec47fdafa3d 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -82,10 +82,7 @@ public: d_buffer, d_rng_state); -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index); start_arg_index += kernel_set_args(ckPathTraceKernel, start_arg_index, diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 76dcbd6fc9a..16a96213100 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -25,6 +25,7 @@ #include "device/device_split_kernel.h" +#include "util/util_algorithm.h" #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_path.h" @@ -98,6 +99,8 @@ public: void thread_run(DeviceTask *task) { + flush_texture_buffers(); + if(task->type == DeviceTask::FILM_CONVERT) { film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } @@ -112,10 +115,19 @@ public: */ typedef struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; + + typedef struct _tex_info_t { + uint buffer, padding; + uint64_t offset; + uint width, height, depth, options; + } _tex_info_t; + #define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; + _tex_info_t name; #include "kernel/kernel_textures.h" #undef KERNEL_TEX + SplitData split_data; SplitParams split_param_data; } KernelGlobals; @@ -176,17 +188,58 @@ protected: friend class OpenCLSplitKernelFunction; }; +struct CachedSplitMemory { + int id; + device_memory *split_data; + device_memory *ray_state; + device_ptr *rng_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; +}; + class OpenCLSplitKernelFunction : public SplitKernelFunction { public: OpenCLDeviceSplitKernel* device; OpenCLDeviceBase::OpenCLProgram program; + CachedSplitMemory& cached_memory; + int cached_id; - OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} - ~OpenCLSplitKernelFunction() { program.release(); } + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) + { + } + + ~OpenCLSplitKernelFunction() + { + program.release(); + } virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - device->kernel_set_args(program(), 0, kg, data); + if(cached_id != cached_memory.id) { + cl_uint start_arg_index = + device->kernel_set_args(program(), + 0, + kg, + data, + *cached_memory.split_data, + *cached_memory.ray_state, + *cached_memory.rng_state); + + device->set_kernel_arg_buffers(program(), &start_arg_index); + + start_arg_index += + device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, program(), @@ -213,14 +266,15 @@ public: class OpenCLSplitKernel : public DeviceSplitKernel { OpenCLDeviceSplitKernel *device; + CachedSplitMemory cached_memory; public: explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, const DeviceRequestedFeatures& requested_features) { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); bool single_program = OpenCLInfo::use_single_program(); kernel->program = @@ -305,11 +359,7 @@ public: ray_state, rtile.rng_state); -/* TODO(sergey): Avoid map lookup here. */ -#define KERNEL_TEX(type, ttype, name) \ - device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index); start_arg_index += device->kernel_set_args(device->program_data_init(), @@ -349,6 +399,15 @@ public: return false; } + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.rng_state = &rtile.rng_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + return true; } @@ -368,12 +427,18 @@ public: cl_ulong max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); - int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 642c1bfa11c..7d5173a5f1d 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5() } OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_file, - string kernel_build_options, + const string& program_name, + const string& kernel_file, + const string& kernel_build_options, bool use_stdout) : device(device), program_name(program_name), @@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) +void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; @@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) } } -void OpenCLDeviceBase::OpenCLProgram::add_error(string msg) +void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(!get_device_name(device_id, &device_name)) { return false; } + + int driver_major = 0; + int driver_minor = 0; + if(!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework * (aka, it will not be on Intel framework). This isn't supported * and needs an explicit blacklist. @@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { + if(driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = { + /* GCN 1 */ + "Tahiti", "Pitcairn", "Capeverde", "Oland", + NULL + }; + for(int i = 0; blacklist[i] != NULL; i++) { + if(device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } return true; } if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { @@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device, return true; } -string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) { if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { /* Use cl_amd_device_topology extension. */ @@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, - &length) == CL_SUCCESS) + &length) == CL_SUCCESS) { if(length != 0 && board_name[0] != '\0') { return board_name; @@ -1073,6 +1096,34 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +bool OpenCLInfo::get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DRIVER_VERSION, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + if(sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; +} + int OpenCLInfo::mem_address_alignment(cl_device_id device_id) { int base_align_bits; |