diff options
Diffstat (limited to 'intern/cycles/device/opencl')
-rw-r--r-- | intern/cycles/device/opencl/memory_manager.cpp | 361 | ||||
-rw-r--r-- | intern/cycles/device/opencl/memory_manager.h | 97 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 1222 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_split.cpp | 3506 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_util.cpp | 1948 |
5 files changed, 3505 insertions, 3629 deletions
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp index 9cb105982aa..f85aadce1c2 100644 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -16,241 +16,246 @@ #ifdef WITH_OPENCL -#include "util/util_foreach.h" +# include "util/util_foreach.h" -#include "device/opencl/opencl.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/opencl.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN -void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) { - allocations.push_back(&allocation); + allocations.push_back(&allocation); } void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) { - bool need_realloc = false; - - /* Calculate total size and remove any freed. */ - size_t total_size = 0; - - for(int i = allocations.size()-1; i >= 0; i--) { - Allocation* allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if(!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin()+i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. */ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if(allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - if(need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. */ - opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, NULL, NULL - )); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for (int i = allocations.size() - 1; i >= 0; i--) { + Allocation *allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if (!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin() + i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. */ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if (allocation->size != alloc_size) { + /* Allocation is either new or resized. */ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if (need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, + "memory manager buffer"); + + new_buffer->alloc_to_device(total_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, + clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, + NULL, + NULL)); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); } void MemoryManager::DeviceBuffer::free(OpenCLDevice *) { - buffer->free(); + buffer->free(); } -MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() { - DeviceBuffer* smallest = device_buffers; + DeviceBuffer *smallest = device_buffers; - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } - return smallest; + return smallest; } -MemoryManager::MemoryManager(OpenCLDevice *device) -: device(device), need_update(false) +MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); + } } void MemoryManager::free() { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.free(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.free(device); + } } -void MemoryManager::alloc(const char *name, device_memory& mem) +void MemoryManager::alloc(const char *name, device_memory &mem) { - Allocation& allocation = allocations[name]; + Allocation &allocation = allocations[name]; - allocation.mem = &mem; - allocation.needs_copy_to_device = true; + allocation.mem = &mem; + allocation.needs_copy_to_device = true; - if(!allocation.device_buffer) { - DeviceBuffer* device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; + if (!allocation.device_buffer) { + DeviceBuffer *device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; - allocation.desc.device_buffer = device_buffer - device_buffers; + allocation.desc.device_buffer = device_buffer - device_buffers; - device_buffer->add_allocation(allocation); + device_buffer->add_allocation(allocation); - device_buffer->size += mem.memory_size(); - } + device_buffer->size += mem.memory_size(); + } - need_update = true; + need_update = true; } -bool MemoryManager::free(device_memory& mem) +bool MemoryManager::free(device_memory &mem) { - foreach(AllocationsMap::value_type& value, allocations) { - Allocation& allocation = value.second; - if(allocation.mem == &mem) { + foreach (AllocationsMap::value_type &value, allocations) { + Allocation &allocation = value.second; + if (allocation.mem == &mem) { - allocation.device_buffer->size -= mem.memory_size(); + allocation.device_buffer->size -= mem.memory_size(); - allocation.mem = NULL; - allocation.needs_copy_to_device = false; + allocation.mem = NULL; + allocation.needs_copy_to_device = false; - need_update = true; - return true; - } - } + need_update = true; + return true; + } + } - return false; + return false; } MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) { - update_device_memory(); + update_device_memory(); - Allocation& allocation = allocations[name]; - return allocation.desc; + Allocation &allocation = allocations[name]; + return allocation.desc; } void MemoryManager::update_device_memory() { - if(!need_update) { - return; - } + if (!need_update) { + return; + } - need_update = false; + need_update = false; - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.update_device_memory(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } } void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - update_device_memory(); - - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++, device->null_mem); - } - } + update_device_memory(); + + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h index 8fcc4440369..2fbc97a0756 100644 --- a/intern/cycles/device/opencl/memory_manager.h +++ b/intern/cycles/device/opencl/memory_manager.h @@ -29,78 +29,77 @@ CCL_NAMESPACE_BEGIN class OpenCLDevice; class MemoryManager { -public: - static const int NUM_DEVICE_BUFFERS = 8; + public: + static const int NUM_DEVICE_BUFFERS = 8; - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; -private: - struct DeviceBuffer; + private: + struct DeviceBuffer; - struct Allocation { - device_memory *mem; + struct Allocation { + device_memory *mem; - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. */ - BufferDescriptor desc; + BufferDescriptor desc; - bool needs_copy_to_device; + bool needs_copy_to_device; - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation*> allocations; - size_t size; /* Size of all allocations. */ + struct DeviceBuffer { + device_only_memory<uchar> *buffer; + vector<Allocation *> allocations; + size_t size; /* Size of all allocations. */ - DeviceBuffer() - : buffer(NULL), size(0) - { - } + DeviceBuffer() : buffer(NULL), size(0) + { + } - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } + ~DeviceBuffer() + { + delete buffer; + buffer = NULL; + } - void add_allocation(Allocation& allocation); + void add_allocation(Allocation &allocation); - void update_device_memory(OpenCLDevice *device); + void update_device_memory(OpenCLDevice *device); - void free(OpenCLDevice *device); - }; + void free(OpenCLDevice *device); + }; - OpenCLDevice *device; + OpenCLDevice *device; - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; - bool need_update; + bool need_update; - DeviceBuffer* smallest_device_buffer(); + DeviceBuffer *smallest_device_buffer(); -public: - MemoryManager(OpenCLDevice *device); + public: + MemoryManager(OpenCLDevice *device); - void free(); /* Free all memory. */ + void free(); /* Free all memory. */ - void alloc(const char *name, device_memory& mem); - bool free(device_memory& mem); + void alloc(const char *name, device_memory &mem); + bool free(device_memory &mem); - BufferDescriptor get_descriptor(string name); + BufferDescriptor get_descriptor(string name); - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 89761293638..e7bafa0b8a8 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -16,645 +16,641 @@ #ifdef WITH_OPENCL -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_split_kernel.h" +# include "device/device.h" +# include "device/device_denoising.h" +# include "device/device_split_kernel.h" -#include "util/util_map.h" -#include "util/util_param.h" -#include "util/util_string.h" +# include "util/util_map.h" +# include "util/util_param.h" +# include "util/util_string.h" -#include "clew.h" +# include "clew.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN /* Disable workarounds, seems to be working fine on latest drivers. */ -#define CYCLES_DISABLE_DRIVER_WORKAROUNDS +# define CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ -#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) +# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string& platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string& device_name, - const string& hardware_id, - const string& device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) {} - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string device_extensions; + OpenCLPlatformDevice(cl_platform_id platform_id, + const string &platform_name, + cl_device_id device_id, + cl_device_type device_type, + const string &device_name, + const string &hardware_id, + const string &device_extensions) + : platform_id(platform_id), + platform_name(platform_name), + device_id(device_id), + device_type(device_type), + device_name(device_name), + hardware_id(hardware_id), + device_extensions(device_extensions) + { + } + cl_platform_id platform_id; + string platform_name; + cl_device_id device_id; + cl_device_type device_type; + string device_name; + string hardware_id; + string device_extensions; }; /* Contains all static OpenCL helper functions. */ -class OpenCLInfo -{ -public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string& platform_name, - const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, - string *error = NULL); - static bool device_version_check(cl_device_id device, - string *error = NULL); - static string get_hardware_id(const string& platform_name, - cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all = false); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. */ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, - string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int* error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. */ - static bool get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int* error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. - */ - static string get_readable_device_name(cl_device_id device_id); +class OpenCLInfo { + public: + static cl_device_type device_type(); + static bool use_debug(); + static bool device_supported(const string &platform_name, const cl_device_id device_id); + static bool platform_version_check(cl_platform_id platform, string *error = NULL); + static bool device_version_check(cl_device_id device, string *error = NULL); + static string get_hardware_id(const string &platform_name, cl_device_id device_id); + static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, + bool force_all = false); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information. */ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int *error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. */ + static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_extensions(cl_device_id device_id, + string *device_extensions, + cl_int *error = NULL); + + static string get_device_extensions(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int *error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int *error = NULL); + + static int mem_sub_ptr_alignment(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. */ -class OpenCLCache -{ - struct Slot - { - struct ProgramEntry - { - ProgramEntry(); - ProgramEntry(const ProgramEntry& rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot& rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache& global_instance(); - -public: - - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock& slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock& slot_locker); - - static string get_kernel_md5(); +class OpenCLCache { + struct Slot { + struct ProgramEntry { + ProgramEntry(); + ProgramEntry(const ProgramEntry &rhs); + ~ProgramEntry(); + cl_program program; + thread_mutex *mutex; + }; + + Slot(); + Slot(const Slot &rhs); + ~Slot(); + + thread_mutex *context_mutex; + cl_context context; + typedef map<ustring, ProgramEntry> EntryMap; + EntryMap programs; + }; + + /* key is combination of platform ID and device ID */ + typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; + + /* map of Slot objects */ + typedef map<PlatformDevicePair, Slot> CacheMap; + CacheMap cache; + + /* MD5 hash of the kernel source. */ + string kernel_md5; + + thread_mutex cache_lock; + thread_mutex kernel_md5_lock; + + /* lazy instantiate */ + static OpenCLCache &global_instance(); + + public: + enum ProgramName { + OCL_DEV_BASE_PROGRAM, + OCL_DEV_MEGAKERNEL_PROGRAM, + }; + + /* Lookup context in the cache. If this returns NULL, slot_locker + * will be holding a lock for the cache. slot_locker should refer to a + * default constructed thread_scoped_lock. */ + static cl_context get_context(cl_platform_id platform, + cl_device_id device, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static cl_program get_program(cl_platform_id platform, + cl_device_id device, + ustring key, + thread_scoped_lock &slot_locker); + + /* Store context in the cache. You MUST have tried to get the item before storing to it. */ + static void store_context(cl_platform_id platform, + cl_device_id device, + cl_context context, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static void store_program(cl_platform_id platform, + cl_device_id device, + cl_program program, + ustring key, + thread_scoped_lock &slot_locker); + + static string get_kernel_md5(); }; -#define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if((device)->error_message() == "") \ - (device)->set_error(message); \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -#define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -class OpenCLDevice : public Device -{ -public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - cl_context cxContext; - cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - bool use_preview_kernels; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) {} - OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_name, - const string& kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, failback to local) */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const { return loaded; } - const string& get_log() const { return log; } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string& clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. */ - bool load_binary(const string& clbin, const string *debug_src = NULL); - bool save_binary(const string& clbin); - - void add_log(const string& msg, bool is_debug); - void add_error(const string& msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. */ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given `programs` - * paramter. */ - void load_kernels(vector<OpenCLProgram*> &programs, - const DeviceRequestedFeatures& requested_features, - bool is_preview=false); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - OpenCLSplitPrograms preview_programs; - - typedef map<string, device_vector<uchar>*> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - device_ptr null_mem; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string& message); - void opencl_assert_err(cl_int err, const char* where); - - OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data); - - bool opencl_version_check(); - OpenCLSplitPrograms* get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures& requested_features); - void load_required_kernels(const DeviceRequestedFeatures& requested_features); - void load_preview_kernels(); - - bool wait_for_availability(const DeviceRequestedFeatures& requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string& kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string& kernel_name); - string get_build_options(const DeviceRequestedFeatures& requested_features, - const string& opencl_program_name, - bool preview_kernel=false); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures& features); - - void mem_alloc(device_memory& mem); - void mem_copy_to(device_memory& mem); - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); - void mem_zero(device_memory& mem); - void mem_free(device_memory& mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void tex_alloc(device_memory& mem); - void tex_free(device_memory& mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); - void shader(DeviceTask& task); - - void denoise(RenderTile& tile, DenoisingTask& denoising); - - class OpenCLDeviceTask : public DeviceTask { - public: - OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&OpenCLDevice::thread_run, - device, - this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - task_pool.push(new OpenCLDeviceTask(this, task)); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask *task); - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const { - return true; - } - - -protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - template<typename T> - ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), - int_value(argument), - pointer(&int_value) - { - } - - ArgumentWrapper(float argument) : size(sizeof(float)), - float_value(argument), - pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper& arg1 = ArgumentWrapper(), - const ArgumentWrapper& arg2 = ArgumentWrapper(), - const ArgumentWrapper& arg3 = ArgumentWrapper(), - const ArgumentWrapper& arg4 = ArgumentWrapper(), - const ArgumentWrapper& arg5 = ArgumentWrapper(), - const ArgumentWrapper& arg6 = ArgumentWrapper(), - const ArgumentWrapper& arg7 = ArgumentWrapper(), - const ArgumentWrapper& arg8 = ArgumentWrapper(), - const ArgumentWrapper& arg9 = ArgumentWrapper(), - const ArgumentWrapper& arg10 = ArgumentWrapper(), - const ArgumentWrapper& arg11 = ArgumentWrapper(), - const ArgumentWrapper& arg12 = ArgumentWrapper(), - const ArgumentWrapper& arg13 = ArgumentWrapper(), - const ArgumentWrapper& arg14 = ArgumentWrapper(), - const ArgumentWrapper& arg15 = ArgumentWrapper(), - const ArgumentWrapper& arg16 = ArgumentWrapper(), - const ArgumentWrapper& arg17 = ArgumentWrapper(), - const ArgumentWrapper& arg18 = ArgumentWrapper(), - const ArgumentWrapper& arg19 = ArgumentWrapper(), - const ArgumentWrapper& arg20 = ArgumentWrapper(), - const ArgumentWrapper& arg21 = ArgumentWrapper(), - const ArgumentWrapper& arg22 = ArgumentWrapper(), - const ArgumentWrapper& arg23 = ArgumentWrapper(), - const ArgumentWrapper& arg24 = ArgumentWrapper(), - const ArgumentWrapper& arg25 = ArgumentWrapper(), - const ArgumentWrapper& arg26 = ArgumentWrapper(), - const ArgumentWrapper& arg27 = ArgumentWrapper(), - const ArgumentWrapper& arg28 = ArgumentWrapper(), - const ArgumentWrapper& arg29 = ArgumentWrapper(), - const ArgumentWrapper& arg30 = ArgumentWrapper(), - const ArgumentWrapper& arg31 = ArgumentWrapper(), - const ArgumentWrapper& arg32 = ArgumentWrapper(), - const ArgumentWrapper& arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void release_program_safe(cl_program program); - - /* ** Those guys are for workign around some compiler-specific bugs ** */ - - cl_program load_cached_kernel( - ustring key, - thread_scoped_lock& cache_locker); - - void store_cached_kernel( - cl_program program, - ustring key, - thread_scoped_lock& cache_locker); - -private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory*> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - -protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; +# define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if ((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +# define opencl_assert(stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +class OpenCLDevice : public Device { + public: + DedicatedTaskPool task_pool; + + /* Task pool for required kernels (base, AO kernels during foreground rendering) */ + TaskPool load_required_kernel_task_pool; + /* Task pool for optional kernels (feature kernels during foreground rendering) */ + TaskPool load_kernel_task_pool; + cl_context cxContext; + cl_command_queue cqCommandQueue; + cl_platform_id cpPlatform; + cl_device_id cdDevice; + cl_int ciErr; + int device_num; + bool use_preview_kernels; + + class OpenCLProgram { + public: + OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) + { + } + OpenCLProgram(OpenCLDevice *device, + const string &program_name, + const string &kernel_name, + const string &kernel_build_options, + bool use_stdout = true); + ~OpenCLProgram(); + + void add_kernel(ustring name); + + /* Try to load the program from device cache or disk */ + bool load(); + /* Compile the kernel (first separate, failback to local) */ + void compile(); + /* Create the OpenCL kernels after loading or compiling */ + void create_kernels(); + + bool is_loaded() const + { + return loaded; + } + const string &get_log() const + { + return log; + } + void report_error(); + + /* Wait until this kernel is available to be used + * It will return true when the kernel is available. + * It will return false when the kernel is not available + * or could not be loaded. */ + bool wait_for_availability(); + + cl_kernel operator()(); + cl_kernel operator()(ustring name); + + void release(); + + private: + bool build_kernel(const string *debug_src); + /* Build the program by calling the own process. + * This is required for multithreaded OpenCL compilation, since most Frameworks serialize + * build calls internally if they come from the same process. + * If that is not supported, this function just returns false. + */ + bool compile_separate(const string &clbin); + /* Build the program by calling OpenCL directly. */ + bool compile_kernel(const string *debug_src); + /* Loading and saving the program from/to disk. */ + bool load_binary(const string &clbin, const string *debug_src = NULL); + bool save_binary(const string &clbin); + + void add_log(const string &msg, bool is_debug); + void add_error(const string &msg); + + bool loaded; + bool needs_compiling; + + cl_program program; + OpenCLDevice *device; + + /* Used for the OpenCLCache key. */ + string program_name; + + string kernel_file, kernel_build_options, device_md5; + + bool use_stdout; + string log, error_msg; + string compile_output; + + map<ustring, cl_kernel> kernels; + }; + + /* Container for all types of split programs. */ + class OpenCLSplitPrograms { + public: + OpenCLDevice *device; + OpenCLProgram program_split; + OpenCLProgram program_lamp_emission; + OpenCLProgram program_do_volume; + OpenCLProgram program_indirect_background; + OpenCLProgram program_shader_eval; + OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; + OpenCLProgram program_subsurface_scatter; + OpenCLProgram program_direct_lighting; + OpenCLProgram program_shadow_blocked_ao; + OpenCLProgram program_shadow_blocked_dl; + + OpenCLSplitPrograms(OpenCLDevice *device); + ~OpenCLSplitPrograms(); + + /* Load the kernels and put the created kernels in the given `programs` + * paramter. */ + void load_kernels(vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview = false); + }; + + DeviceSplitKernel *split_kernel; + + OpenCLProgram base_program; + OpenCLProgram bake_program; + OpenCLProgram displace_program; + OpenCLProgram background_program; + OpenCLProgram denoising_program; + + OpenCLSplitPrograms kernel_programs; + OpenCLSplitPrograms preview_programs; + + typedef map<string, device_vector<uchar> *> ConstMemMap; + typedef map<string, device_ptr> MemMap; + + ConstMemMap const_mem_map; + MemMap mem_map; + device_ptr null_mem; + + bool device_initialized; + string platform_name; + string device_name; + + bool opencl_error(cl_int err); + void opencl_error(const string &message); + void opencl_assert_err(cl_int err, const char *where); + + OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + ~OpenCLDevice(); + + static void CL_CALLBACK context_notify_callback(const char *err_info, + const void * /*private_info*/, + size_t /*cb*/, + void *user_data); + + bool opencl_version_check(); + OpenCLSplitPrograms *get_split_programs(); + + string device_md5_hash(string kernel_custom_build_options = ""); + bool load_kernels(const DeviceRequestedFeatures &requested_features); + void load_required_kernels(const DeviceRequestedFeatures &requested_features); + void load_preview_kernels(); + + bool wait_for_availability(const DeviceRequestedFeatures &requested_features); + DeviceKernelStatus get_active_kernel_switch_state(); + + /* Get the name of the opencl program for the given kernel */ + const string get_opencl_program_name(const string &kernel_name); + /* Get the program file name to compile (*.cl) for the given kernel */ + const string get_opencl_program_filename(const string &kernel_name); + string get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel = false); + /* Enable the default features to reduce recompilation events */ + void enable_default_features(DeviceRequestedFeatures &features); + + void mem_alloc(device_memory &mem); + void mem_copy_to(device_memory &mem); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + void mem_zero(device_memory &mem); + void mem_free(device_memory &mem); + + int mem_sub_ptr_alignment(); + + void const_copy_to(const char *name, void *host, size_t size); + void tex_alloc(device_memory &mem); + void tex_free(device_memory &mem); + + size_t global_size_round_up(int group_size, int global_size); + void enqueue_kernel(cl_kernel kernel, + size_t w, + size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); + void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half); + void shader(DeviceTask &task); + + void denoise(RenderTile &tile, DenoisingTask &denoising); + + class OpenCLDeviceTask : public DeviceTask { + public: + OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&OpenCLDevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + } + + void task_add(DeviceTask &task) + { + task_pool.push(new OpenCLDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + void thread_run(DeviceTask *task); + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + virtual bool show_samples() const + { + return true; + } + + protected: + string kernel_build_options(const string *debug_src = NULL); + + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); + void mem_free_sub_ptr(device_ptr ptr); + + class ArgumentWrapper { + public: + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_only_memory<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) + { + } + + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) + { + } + + ArgumentWrapper(float argument) + : size(sizeof(float)), float_value(argument), pointer(&float_value) + { + } + + size_t size; + int int_value; + float float_value; + void *pointer; + }; + + /* TODO(sergey): In the future we can use variadic templates, once + * C++0x is allowed. Should allow to clean this up a bit. + */ + int kernel_set_args(cl_kernel kernel, + int start_argument_index, + const ArgumentWrapper &arg1 = ArgumentWrapper(), + const ArgumentWrapper &arg2 = ArgumentWrapper(), + const ArgumentWrapper &arg3 = ArgumentWrapper(), + const ArgumentWrapper &arg4 = ArgumentWrapper(), + const ArgumentWrapper &arg5 = ArgumentWrapper(), + const ArgumentWrapper &arg6 = ArgumentWrapper(), + const ArgumentWrapper &arg7 = ArgumentWrapper(), + const ArgumentWrapper &arg8 = ArgumentWrapper(), + const ArgumentWrapper &arg9 = ArgumentWrapper(), + const ArgumentWrapper &arg10 = ArgumentWrapper(), + const ArgumentWrapper &arg11 = ArgumentWrapper(), + const ArgumentWrapper &arg12 = ArgumentWrapper(), + const ArgumentWrapper &arg13 = ArgumentWrapper(), + const ArgumentWrapper &arg14 = ArgumentWrapper(), + const ArgumentWrapper &arg15 = ArgumentWrapper(), + const ArgumentWrapper &arg16 = ArgumentWrapper(), + const ArgumentWrapper &arg17 = ArgumentWrapper(), + const ArgumentWrapper &arg18 = ArgumentWrapper(), + const ArgumentWrapper &arg19 = ArgumentWrapper(), + const ArgumentWrapper &arg20 = ArgumentWrapper(), + const ArgumentWrapper &arg21 = ArgumentWrapper(), + const ArgumentWrapper &arg22 = ArgumentWrapper(), + const ArgumentWrapper &arg23 = ArgumentWrapper(), + const ArgumentWrapper &arg24 = ArgumentWrapper(), + const ArgumentWrapper &arg25 = ArgumentWrapper(), + const ArgumentWrapper &arg26 = ArgumentWrapper(), + const ArgumentWrapper &arg27 = ArgumentWrapper(), + const ArgumentWrapper &arg28 = ArgumentWrapper(), + const ArgumentWrapper &arg29 = ArgumentWrapper(), + const ArgumentWrapper &arg30 = ArgumentWrapper(), + const ArgumentWrapper &arg31 = ArgumentWrapper(), + const ArgumentWrapper &arg32 = ArgumentWrapper(), + const ArgumentWrapper &arg33 = ArgumentWrapper()); + + void release_kernel_safe(cl_kernel kernel); + void release_mem_object_safe(cl_mem mem); + void release_program_safe(cl_program program); + + /* ** Those guys are for workign around some compiler-specific bugs ** */ + + cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); + + void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); + + private: + MemoryManager memory_manager; + friend class MemoryManager; + + static_assert_align(TextureInfo, 16); + device_vector<TextureInfo> texture_info; + + typedef map<string, device_memory *> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + + protected: + void flush_texture_buffers(); + + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; }; -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background); +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background); CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 489d10b7087..70b1a643044 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -16,273 +16,278 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" +# include "device/opencl/opencl.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" +# include "kernel/kernel_types.h" +# include "kernel/split/kernel_split_data_types.h" -#include "util/util_algorithm.h" -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" +# include "util/util_algorithm.h" +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" CCL_NAMESPACE_BEGIN struct texture_slot_t { - texture_slot_t(const string& name, int slot) - : name(name), - slot(slot) { - } - string name; - int slot; + texture_slot_t(const string &name, int slot) : name(name), slot(slot) + { + } + string name; + int slot; }; static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; + "denoising " + "base " + "background " + "displace "; static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update"; - -const string OpenCLDevice::get_opencl_program_name(const string& kernel_name) + "data_init " + "path_init " + "state_buffer_size " + "scene_intersect " + "queue_enqueue " + "shader_setup " + "shader_sort " + "enqueue_inactive " + "next_iteration_setup " + "indirect_subsurface " + "buffer_update"; + +const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } + if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { + return kernel_name; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "split_bundle"; + } + else { + return "split_" + kernel_name; + } } -const string OpenCLDevice::get_opencl_program_filename(const string& kernel_name) +const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) { - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } + if (kernel_name == "denoising") { + return "filter.cl"; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "kernel_split_bundle.cl"; + } + else { + return "kernel_" + kernel_name + ".cl"; + } } /* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures& features) +void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) { - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) - { - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } + features.use_transparent = true; + features.use_shadow_tricks = true; + features.use_principled = true; + features.use_denoising = true; + + if (!background) { + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_hair = true; + features.use_subsurface = true; + features.use_camera_motion = false; + features.use_object_motion = false; + } } -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name, bool preview_kernel) +string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel) { - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. */ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (preview_kernel) { - DeviceRequestedFeatures preview_features; - preview_features.use_hair = true; - build_options += "-D__KERNEL_AO_PREVIEW__ "; - build_options += preview_features.get_build_options(); - } - else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; + /* first check for non-split kernel programs */ + if (opencl_program_name == "base" || opencl_program_name == "denoising") { + return ""; + } + else if (opencl_program_name == "bake") { + /* Note: get_build_options for bake is only requested when baking is enabled. + * displace and background are always requested. + * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_hair = true; + features.use_subsurface = true; + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "displace") { + /* As displacement does not use any nodes from the Shading group (eg BSDF). + * We disable all features that are related to shading. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_baking = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_subsurface = false; + features.use_volume = false; + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_denoising = false; + features.use_principled = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "background") { + /* Background uses Background shading + * It is save to disable shadow features, subsurface and volumetric. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_baking = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_denoising = false; + /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. + * Perhaps we should remove them in UI as it does not make any sense when + * rendering background. */ + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_subsurface = false; + features.use_volume = false; + features.use_shader_raytrace = false; + features.use_patch_evaluation = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + + string build_options = "-D__SPLIT_KERNEL__ "; + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); + assert(this->ciErr == CL_SUCCESS); + if (device_type == CL_DEVICE_TYPE_GPU) { + build_options += "-D__COMPUTE_DEVICE_GPU__ "; + } + + DeviceRequestedFeatures nofeatures; + enable_default_features(nofeatures); + + /* Add program specific optimized compile directives */ + if (preview_kernel) { + DeviceRequestedFeatures preview_features; + preview_features.use_hair = true; + build_options += "-D__KERNEL_AO_PREVIEW__ "; + build_options += preview_features.get_build_options(); + } + else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { + build_options += nofeatures.get_build_options(); + } + else { + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + + /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. + * this also makes sure that the kernels that are build during baking can be reused + * when not doing any baking. */ + features.use_baking = false; + + /* Do not vary on shaders when program doesn't do any shading. + * We have bundled them in a single program. */ + if (opencl_program_name == "split_bundle") { + features.max_nodes_group = 0; + features.nodes_features = 0; + features.use_shader_raytrace = false; + } + + /* No specific settings, just add the regular ones */ + build_options += features.get_build_options(); + } + + return build_options; } OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) { - device = device_; + device = device_; } OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() { - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); + program_split.release(); + program_lamp_emission.release(); + program_do_volume.release(); + program_indirect_background.release(); + program_shader_eval.release(); + program_holdout_emission_blurring_pathtermination_ao.release(); + program_subsurface_scatter.release(); + program_direct_lighting.release(); + program_shadow_blocked_ao.release(); + program_shadow_blocked_dl.release(); } -void OpenCLDevice::OpenCLSplitPrograms::load_kernels(vector<OpenCLProgram*> &programs, const DeviceRequestedFeatures& requested_features, bool is_preview) +void OpenCLDevice::OpenCLSplitPrograms::load_kernels( + vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview) { - if (!requested_features.use_baking) { -#define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); -#define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_"#kernel_name; \ - program_##kernel_name = \ - OpenCLDevice::OpenCLProgram(device, \ - program_name_##kernel_name, \ - "kernel_"#kernel_name".cl", \ - device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ - program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - if (requested_features.use_volume || is_preview) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. */ - program_split = OpenCLDevice::OpenCLProgram(device, - "split_bundle" , - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle", is_preview)); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - programs.push_back(&program_split); - -#undef ADD_SPLIT_KERNEL_PROGRAM -#undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } + if (!requested_features.use_baking) { +# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ + program_split.add_kernel(ustring("path_trace_" #kernel_name)); +# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ + const string program_name_##kernel_name = "split_" #kernel_name; \ + program_##kernel_name = OpenCLDevice::OpenCLProgram( \ + device, \ + program_name_##kernel_name, \ + "kernel_" #kernel_name ".cl", \ + device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ + program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ + programs.push_back(&program_##kernel_name); + + /* Ordered with most complex kernels first, to reduce overall compile time. */ + ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); + if (requested_features.use_volume || is_preview) { + ADD_SPLIT_KERNEL_PROGRAM(do_volume); + } + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); + ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); + ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); + ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); + ADD_SPLIT_KERNEL_PROGRAM(indirect_background); + ADD_SPLIT_KERNEL_PROGRAM(shader_eval); + + /* Quick kernels bundled in a single program to reduce overhead of starting + * Blender processes. */ + program_split = OpenCLDevice::OpenCLProgram( + device, + "split_bundle", + "kernel_split_bundle.cl", + device->get_build_options(requested_features, "split_bundle", is_preview)); + + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + programs.push_back(&program_split); + +# undef ADD_SPLIT_KERNEL_PROGRAM +# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM + } } namespace { @@ -291,1126 +296,1108 @@ namespace { * fetch its size. */ typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; + ccl_constant KernelData *data; + ccl_global char *buffers[8]; -#define KERNEL_TEX(type, name) \ - TextureInfo name; +# define KERNEL_TEX(type, name) TextureInfo name; # include "kernel/kernel_textures.h" -#undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; +# undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; } KernelGlobalsDummy; } // namespace - struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; + int id; + device_memory *split_data; + device_memory *ray_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; }; class OpenCLSplitKernelFunction : public SplitKernelFunction { -public: - OpenCLDevice* device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory& cached_memory; - int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) : - device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) - { - if(cached_id != cached_memory.id) { - cl_uint start_arg_index = - device->kernel_set_args(program(), - 0, - kg, - data, - *cached_memory.split_data, - *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += - device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } + public: + OpenCLDevice *device; + OpenCLDevice::OpenCLProgram program; + CachedSplitMemory &cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) + : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) + { + } + + ~OpenCLSplitKernelFunction() + { + program.release(); + } + + virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) + { + if (cached_id != cached_memory.id) { + cl_uint start_arg_index = device->kernel_set_args( + program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); + + device->set_kernel_arg_buffers(program(), &start_arg_index); + + start_arg_index += device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + return true; + } }; class OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; -public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) { - } - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures& requested_features) - { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = - OpenCLDevice::OpenCLProgram(device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, - program_name, - device->use_preview_kernels)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if(!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_state_buffer_size = programs->program_split(ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs - ) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = - device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += - device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. */ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. */ - if(type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(max_buffer_size) << " bytes. (" - << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } + OpenCLDevice *device; + CachedSplitMemory cached_memory; + + public: + explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) + { + } + + virtual SplitKernelFunction *get_split_kernel_function( + const string &kernel_name, const DeviceRequestedFeatures &requested_features) + { + OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); + + const string program_name = device->get_opencl_program_name(kernel_name); + kernel->program = OpenCLDevice::OpenCLProgram( + device, + program_name, + device->get_opencl_program_filename(kernel_name), + device->get_build_options(requested_features, program_name, device->use_preview_kernels)); + + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if (!kernel->program.is_loaded()) { + delete kernel; + return NULL; + } + + return kernel; + } + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads) + { + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_state_buffer_size = programs->program_split( + ustring("path_trace_state_buffer_size")); + device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_state_buffer_size, + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return size; + } + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) + { + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; + + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. + */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; + + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); + + cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, + 0, + kernel_globals, + kernel_data, + split_data, + num_global_elements, + ray_state); + + device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); + + start_arg_index += device->kernel_set_args(kernel_data_init, + start_arg_index, + start_sample, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, + dQueue_size, + use_queues_flag, + work_pool_wgs, + rtile.num_samples, + rtile.buffer); + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_data_init, + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + + return true; + } + + virtual int2 split_kernel_local_size() + { + return make_int2(64, 1); + } + + virtual int2 split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) + { + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); + /* Use small global size on CPU devices as it seems to be much faster. */ + if (type == CL_DEVICE_TYPE_CPU) { + VLOG(1) << "Global size: (64, 64)."; + return make_int2(64, 64); + } + + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) + << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; + + /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ + max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), + (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; + } }; bool OpenCLDevice::opencl_error(cl_int err) { - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; + if (err != CL_SUCCESS) { + string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + return true; + } + + return false; } -void OpenCLDevice::opencl_error(const string& message) +void OpenCLDevice::opencl_error(const string &message) { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); } -void OpenCLDevice::opencl_assert_err(cl_int err, const char* where) +void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) { - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -#ifndef NDEBUG - abort(); -#endif - } + if (err != CL_SUCCESS) { + string message = string_printf( + "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); +# ifndef NDEBUG + abort(); +# endif + } } -OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) -: Device(info, stats, profiler, background), - kernel_programs(this), - preview_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_TEXTURE) +OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) + : Device(info, stats, profiler, background), + kernel_programs(this), + preview_programs(this), + memory_manager(this), + texture_info(this, "__texture_info", MEM_TEXTURE) { - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - null_mem = 0; - device_initialized = false; - textures_need_update = true; - use_preview_kernels = !background; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice& platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " - << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if(cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, - 0, 0 - }; - - /* create context */ - cxContext = clCreateContext(context_props, 1, &cdDevice, - context_notify_callback, cdDevice, &ciErr); - - if(opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating memory buffer for NULL"); - return; - } - - /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); - if (!background) { - load_preview_kernels(); - } + cpPlatform = NULL; + cdDevice = NULL; + cxContext = NULL; + cqCommandQueue = NULL; + null_mem = 0; + device_initialized = false; + textures_need_update = true; + use_preview_kernels = !background; + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (usable_devices.size() == 0) { + opencl_error("OpenCL: no devices found."); + return; + } + assert(info.num < usable_devices.size()); + OpenCLPlatformDevice &platform_device = usable_devices[info.num]; + device_num = info.num; + cpPlatform = platform_device.platform_id; + cdDevice = platform_device.device_id; + platform_name = platform_device.platform_name; + device_name = platform_device.device_name; + VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " + << device_name << "."; + + { + /* try to use cached context */ + thread_scoped_lock cache_locker; + cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); + + if (cxContext == NULL) { + /* create context properties array to specify platform */ + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; + + /* create context */ + cxContext = clCreateContext( + context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); + + if (opencl_error(ciErr)) { + opencl_error("OpenCL: clCreateContext failed"); + return; + } + + /* cache it */ + OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); + } + } + + cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); + return; + } + + null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); + return; + } + + /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + memory_manager.alloc("texture_info", texture_info); + + device_initialized = true; + + split_kernel = new OpenCLSplitKernel(this); + if (!background) { + load_preview_kernels(); + } } OpenCLDevice::~OpenCLDevice() { - task_pool.stop(); - load_required_kernel_task_pool.stop(); - load_kernel_task_pool.stop(); + task_pool.stop(); + load_required_kernel_task_pool.stop(); + load_kernel_task_pool.stop(); - memory_manager.free(); + memory_manager.free(); - if(null_mem) - clReleaseMemObject(CL_MEM_PTR(null_mem)); + if (null_mem) + clReleaseMemObject(CL_MEM_PTR(null_mem)); - ConstMemMap::iterator mt; - for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } + ConstMemMap::iterator mt; + for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { + delete mt->second; + } - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); + base_program.release(); + bake_program.release(); + displace_program.release(); + background_program.release(); + denoising_program.release(); - if(cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if(cxContext) - clReleaseContext(cxContext); + if (cqCommandQueue) + clReleaseCommandQueue(cqCommandQueue); + if (cxContext) + clReleaseContext(cxContext); - delete split_kernel; + delete split_kernel; } void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data) + const void * /*private_info*/, + size_t /*cb*/, + void *user_data) { - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDevice::opencl_version_check() { - string error; - if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if(!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; + string error; + if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { + opencl_error(error); + return false; + } + if (!OpenCLInfo::device_version_check(cdDevice, &error)) { + opencl_error(error); + return false; + } + return true; } string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) { - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; + MD5Hash md5; + char version[256], driver[256], name[256], vendor[256]; - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); + clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); + clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - md5.append((uint8_t*)vendor, strlen(vendor)); - md5.append((uint8_t*)version, strlen(version)); - md5.append((uint8_t*)name, strlen(name)); - md5.append((uint8_t*)driver, strlen(driver)); + md5.append((uint8_t *)vendor, strlen(vendor)); + md5.append((uint8_t *)version, strlen(version)); + md5.append((uint8_t *)name, strlen(name)); + md5.append((uint8_t *)driver, strlen(driver)); - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t*)options.c_str(), options.size()); + string options = kernel_build_options(); + options += kernel_custom_build_options; + md5.append((uint8_t *)options.c_str(), options.size()); - return md5.get_hex(); + return md5.get_hex(); } -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) { - VLOG(2) << "Loading kernels for platform " << platform_name - << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if(!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if(!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram*> programs; - kernel_programs.load_kernels(programs, requested_features, false); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } - return true; + VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; + /* Verify if device was initialized. */ + if (!device_initialized) { + fprintf(stderr, "OpenCL: failed to initialize device.\n"); + return false; + } + + /* Verify we have right opencl version. */ + if (!opencl_version_check()) + return false; + + load_required_kernels(requested_features); + + vector<OpenCLProgram *> programs; + kernel_programs.load_kernels(programs, requested_features, false); + + if (!requested_features.use_baking && requested_features.use_denoising) { + denoising_program = OpenCLProgram( + this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_write_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + programs.push_back(&denoising_program); + } + + load_required_kernel_task_pool.wait_work(); + + /* Parallel compilation of Cycles kernels, this launches multiple + * processes to workaround OpenCL frameworks serializing the calls + * internally within a single process. */ + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } + return true; } -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures& requested_features) +void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) { - vector<OpenCLProgram*> programs; - base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + vector<OpenCLProgram *> programs; + base_program = OpenCLProgram( + this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); + base_program.add_kernel(ustring("convert_to_byte")); + base_program.add_kernel(ustring("convert_to_half_float")); + base_program.add_kernel(ustring("zero_buffer")); + programs.push_back(&base_program); + + if (requested_features.use_true_displacement) { + displace_program = OpenCLProgram( + this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); + displace_program.add_kernel(ustring("displace")); + programs.push_back(&displace_program); + } + + if (requested_features.use_background_light) { + background_program = OpenCLProgram(this, + "background", + "kernel_background.cl", + get_build_options(requested_features, "background")); + background_program.add_kernel(ustring("background")); + programs.push_back(&background_program); + } + + if (requested_features.use_baking) { + bake_program = OpenCLProgram( + this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); + bake_program.add_kernel(ustring("bake")); + programs.push_back(&bake_program); + } + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } void OpenCLDevice::load_preview_kernels() { - DeviceRequestedFeatures no_features; - vector<OpenCLProgram*> programs; - preview_programs.load_kernels(programs, no_features, true); - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + DeviceRequestedFeatures no_features; + vector<OpenCLProgram *> programs; + preview_programs.load_kernels(programs, no_features, true); + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) { - if (background) { - load_kernel_task_pool.wait_work(); - use_preview_kernels = false; - } - else { - /* We use a device setting to determine to load preview kernels or not - * Better to check on device level than per kernel as mixing preview and - * non-preview kernels does not work due to different data types */ - if (use_preview_kernels) { - use_preview_kernels = !load_kernel_task_pool.finished(); - } - } - return split_kernel->load_kernels(requested_features); + if (background) { + load_kernel_task_pool.wait_work(); + use_preview_kernels = false; + } + else { + /* We use a device setting to determine to load preview kernels or not + * Better to check on device level than per kernel as mixing preview and + * non-preview kernels does not work due to different data types */ + if (use_preview_kernels) { + use_preview_kernels = !load_kernel_task_pool.finished(); + } + } + return split_kernel->load_kernels(requested_features); } -OpenCLDevice::OpenCLSplitPrograms* OpenCLDevice::get_split_programs() +OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() { - return use_preview_kernels?&preview_programs:&kernel_programs; + return use_preview_kernels ? &preview_programs : &kernel_programs; } DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() { - /* Do not switch kernels for background renderings - * We do foreground rendering but use the preview kernels - * Check for the optimized kernels - * - * This works also the other way around, where we are using - * optimized kernels but new ones are being compiled due - * to other features that are needed */ - if (background) { - /* The if-statements below would find the same result, - * But as the `finished` method uses a mutex we added - * this as an early exit */ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - - bool other_kernels_finished = load_kernel_task_pool.finished(); - if (use_preview_kernels) { - if (other_kernels_finished) { - return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; - } - else { - return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; - } - } - else { - if (other_kernels_finished) { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - else { - return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; - } - } + /* Do not switch kernels for background renderings + * We do foreground rendering but use the preview kernels + * Check for the optimized kernels + * + * This works also the other way around, where we are using + * optimized kernels but new ones are being compiled due + * to other features that are needed */ + if (background) { + /* The if-statements below would find the same result, + * But as the `finished` method uses a mutex we added + * this as an early exit */ + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + + bool other_kernels_finished = load_kernel_task_pool.finished(); + if (use_preview_kernels) { + if (other_kernels_finished) { + return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; + } + else { + return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; + } + } + else { + if (other_kernels_finished) { + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + else { + return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; + } + } } -void OpenCLDevice::mem_alloc(device_memory& mem) +void OpenCLDevice::mem_alloc(device_memory &mem) { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, - cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if(size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if(mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. - */ - if(size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, - mem_flag, - size, - mem_ptr, - &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = null_mem; - } - - stats.mem_alloc(size); - mem.device_size = size; + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + size_t size = mem.memory_size(); + + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if (size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if (mem.name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", mem.name); + } + set_error(error); + + return; + } + + cl_mem_flags mem_flag; + void *mem_ptr = NULL; + + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + /* Zero-size allocation might be invoked by render, but not really + * supported by OpenCL. Using NULL as device pointer also doesn't really + * work for some reason, so for the time being we'll use special case + * will null_mem buffer. + */ + if (size != 0) { + mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer"); + } + else { + mem.device_pointer = null_mem; + } + + stats.mem_alloc(size); + mem.device_size = size; } -void OpenCLDevice::mem_copy_to(device_memory& mem) +void OpenCLDevice::mem_copy_to(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if(size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, NULL)); - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* this is blocking */ + size_t size = mem.memory_size(); + if (size != 0) { + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + size, + mem.host_pointer, + 0, + NULL, + NULL)); + } + } } -void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) +void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) { - size_t offset = elem*y*w; - size_t size = elem*w*h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar*)mem.host_pointer + offset, - 0, - NULL, NULL)); + size_t offset = elem * y * w; + size_t size = elem * w * h; + assert(size != 0); + opencl_assert(clEnqueueReadBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + offset, + size, + (uchar *)mem.host_pointer + offset, + 0, + NULL, + NULL)); } void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) { - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while(d_offset < size) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } + base_program.wait_for_availability(); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while (d_offset < size) { + d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel( + cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } } -void OpenCLDevice::mem_zero(device_memory& mem) +void OpenCLDevice::mem_zero(device_memory &mem) { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - if(base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(!base_program.is_loaded()) { - void* zero = mem.host_pointer; - - if(!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, NULL)); - - if(!mem.host_pointer) { - util_aligned_free(zero); - } - } - } + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + if (base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (!base_program.is_loaded()) { + void *zero = mem.host_pointer; + + if (!mem.host_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, + NULL)); + + if (!mem.host_pointer) { + util_aligned_free(zero); + } + } + } } -void OpenCLDevice::mem_free(device_memory& mem) +void OpenCLDevice::mem_free(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - if(mem.device_pointer) { - if(mem.device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + if (mem.device_pointer) { + if (mem.device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + } + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } } int OpenCLDevice::mem_sub_ptr_alignment() { - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); + return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); } -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) +device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) { - cl_mem_flags mem_flag; - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), - mem_flag, - CL_BUFFER_CREATE_TYPE_REGION, - &info, - &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; + cl_mem_flags mem_flag; + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr)clCreateSubBuffer( + CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; } void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) { - if(device_pointer && device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } + if (device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } } void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size) { - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if(i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); + ConstMemMap::iterator i = const_mem_map.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + memcpy(data->data(), host, size); + data->copy_to_device(); } -void OpenCLDevice::tex_alloc(device_memory& mem) +void OpenCLDevice::tex_alloc(device_memory &mem) { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + memory_manager.alloc(mem.name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[mem.name] = &mem; + textures_need_update = true; } -void OpenCLDevice::tex_free(device_memory& mem) +void OpenCLDevice::tex_free(device_memory &mem) { - if(mem.device_pointer) { - mem.device_pointer = 0; - - if(memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach(TexturesMap::value_type& value, textures) { - if(value.second == &mem) { - textures.erase(value.first); - break; - } - } - } + if (mem.device_pointer) { + mem.device_pointer = 0; + + if (memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach (TexturesMap::value_type &value, textures) { + if (value.second == &mem) { + textures.erase(value.first); + break; + } + } + } } size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) { - int r = global_size % group_size; - return global_size + ((r == 0)? 0: group_size - r); + int r = global_size % group_size; + return global_size + ((r == 0) ? 0 : group_size - r); } -void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) +void OpenCLDevice::enqueue_kernel( + cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo(kernel, cdDevice, - CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); - - if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if(x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if(local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size/max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. - */ - if(h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); + size_t workgroup_size, max_work_items[3]; + + clGetKernelWorkGroupInfo( + kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); + clGetDeviceInfo( + cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); + + if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + + /* Try to divide evenly over 2 dimensions. */ + size_t local_size[2]; + if (x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } + + /* Some implementations have max size 1 on 2nd dimension. */ + if (local_size[1] > max_work_items[1]) { + local_size[0] = workgroup_size / max_work_items[1]; + local_size[1] = max_work_items[1]; + } + + size_t global_size[2] = {global_size_round_up(local_size[0], w), + global_size_round_up(local_size[1], h)}; + + /* Vertical size of 1 is coming from bake/shade kernels where we should + * not round anything up because otherwise we'll either be doing too + * much work per pixel (if we don't check global ID on Y axis) or will + * be checking for global ID to always have Y of 0. + */ + if (h == 1) { + global_size[h] = 1; + } + + /* run kernel */ + opencl_assert( + clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + opencl_assert(clFlush(cqCommandQueue)); } void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) { - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - /* work around NULL not working, even though the spec says otherwise */ - ptr = CL_MEM_PTR(null_mem); - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); + cl_mem ptr; + + MemMap::iterator i = mem_map.find(name); + if (i != mem_map.end()) { + ptr = CL_MEM_PTR(i->second); + } + else { + /* work around NULL not working, even though the spec says otherwise */ + ptr = CL_MEM_PTR(null_mem); + } + + opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); } void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - flush_texture_buffers(); + flush_texture_buffers(); - memory_manager.set_kernel_arg_buffers(kernel, narg); + memory_manager.set_kernel_arg_buffers(kernel, narg); } void OpenCLDevice::flush_texture_buffers() { - if(!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -#define KERNEL_TEX(type, name) \ - if(textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -#include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach(TexturesMap::value_type& tex, textures) { - string name = tex.first; - - if(string_startswith(name, "__tex_image")) { - int pos = name.rfind("_"); - int id = atoi(name.data() + pos + 1); - texture_slots.push_back(texture_slot_t(name, - num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. */ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach(texture_slot_t& slot, texture_slots) { - TextureInfo& info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - - if(string_startswith(slot.name, "__tex_image")) { - device_memory *mem = textures[slot.name]; - - info.width = mem->data_width; - info.height = mem->data_height; - info.depth = mem->data_depth; - - info.interpolation = mem->interpolation; - info.extension = mem->extension; - } - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} + if (!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. */ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +# define KERNEL_TEX(type, name) \ + if (textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +# include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach (TexturesMap::value_type &tex, textures) { + string name = tex.first; + + if (string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_info); + texture_info.resize(num_slots); + memory_manager.alloc("texture_info", texture_info); + + /* Fill in descriptors */ + foreach (texture_slot_t &slot, texture_slots) { + TextureInfo &info = texture_info[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; + if (string_startswith(slot.name, "__tex_image")) { + device_memory *mem = textures[slot.name]; + + info.width = mem->data_width; + info.height = mem->data_height; + info.depth = mem->data_depth; + + info.interpolation = mem->interpolation; + info.extension = mem->extension; + } + } + + /* Force write of descriptors. */ + memory_manager.free(texture_info); + memory_manager.alloc("texture_info", texture_info); +} void OpenCLDevice::thread_run(DeviceTask *task) { - flush_texture_buffers(); - - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, *task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - - kgbuffer.free(); - } + flush_texture_buffers(); + + if (task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + } + else if (task->type == DeviceTask::RENDER) { + RenderTile tile; + DenoisingTask denoising(this, *task); + + /* Allocate buffer for kernel globals */ + device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + /* Keep rendering tiles until done. */ + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + scoped_timer timer(&tile.buffers->render_time); + + split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising); + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + } + + kgbuffer.free(); + } } -void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) +void OpenCLDevice::film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f/(task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - - cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = - kernel_set_args(ckFilmConvertKernel, - 0, - d_data, - d_rgba, - d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); + cl_mem d_buffer = CL_MEM_PTR(buffer); + cl_int d_x = task.x; + cl_int d_y = task.y; + cl_int d_w = task.w; + cl_int d_h = task.h; + cl_float d_sample_scale = 1.0f / (task.sample + 1); + cl_int d_offset = task.offset; + cl_int d_stride = task.stride; + + cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) : + base_program(ustring("convert_to_half_float")); + + cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); + + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); + + start_arg_index += kernel_set_args(ckFilmConvertKernel, + start_arg_index, + d_sample_scale, + d_x, + d_y, + d_w, + d_h, + d_offset, + d_stride); + + enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, @@ -1419,123 +1406,119 @@ bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); - mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, channel_offset, - 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMUpdateOutput, 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, h, stride, - pass_stride, - channel_offset, - r, f); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + device_sub_ptr weightAccum( + task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); + cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; + + mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); + mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + kernel_set_args(ckNLMCalcDifference, + 0, + guide_mem, + variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + channel_offset, + 0, + a, + k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args(ckNLMUpdateOutput, + 0, + blurDifference_mem, + image_mem, + out_mem, + weightAccum_mem, + w, + h, + stride, + pass_stride, + channel_offset, + r, + f); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); + + kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; } bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) { - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time? 1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, - arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, - task->storage.w, - task->storage.h, - 256); - - return true; + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time ? 1 : 0; + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); + + return true; } bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, @@ -1544,136 +1527,130 @@ bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, int frame, DenoisingTask *task) { - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = task->buffer.use_time? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, task->nlm_k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMConstructGramian, 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, h, stride, - pass_stride, - r, 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - - return true; + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time ? 1 : 0; + + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + kernel_set_args(ckNLMCalcDifference, + 0, + color_mem, + color_variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + pass_stride, + frame_offset, + 1.0f, + task->nlm_k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args(ckNLMConstructGramian, + 0, + t, + blurDifference_mem, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, + h, + stride, + pass_stride, + r, + 4, + frame_offset, + use_time); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256); + + return true; } -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) { - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - kernel_set_args(ckFinalize, 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + + kernel_set_args(ckFinalize, + 0, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, w, h); + + return true; } bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, + int r, + int4 rect, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, - mean_mem, - variance_mem, - a_mem, - b_mem, - rect, - r); - enqueue_kernel(ckFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); + enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, @@ -1683,39 +1660,36 @@ bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + int arg_ofs = kernel_set_args( + ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterDivideShadow, + arg_ofs, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_get_feature(int mean_offset, @@ -1725,36 +1699,32 @@ bool OpenCLDevice::denoising_get_feature(int mean_offset, float scale, DenoisingTask *task) { - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterGetFeature, arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterGetFeature, + arg_ofs, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + scale, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_write_feature(int out_offset, @@ -1762,24 +1732,23 @@ bool OpenCLDevice::denoising_write_feature(int out_offset, device_ptr buffer_ptr, DenoisingTask *task) { - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - return true; + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, + 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + return true; } bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, @@ -1788,155 +1757,155 @@ bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, device_ptr output_ptr, DenoisingTask *task) { - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, + 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising) +void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) { - denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); + denoising.functions.construct_transform = function_bind( + &OpenCLDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); } -void OpenCLDevice::shader(DeviceTask& task) +void OpenCLDevice::shader(DeviceTask &task) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - program = &bake_program; - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = - kernel_set_args(kernel, - 0, - d_data, - d_input, - d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_eval_type); - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_x, - d_shader_w, - d_offset); - - for(int sample = 0; sample < task.num_samples; sample++) { - - if(task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_input = CL_MEM_PTR(task.shader_input); + cl_mem d_output = CL_MEM_PTR(task.shader_output); + cl_int d_shader_eval_type = task.shader_eval_type; + cl_int d_shader_filter = task.shader_filter; + cl_int d_shader_x = task.shader_x; + cl_int d_shader_w = task.shader_w; + cl_int d_offset = task.offset; + + OpenCLDevice::OpenCLProgram *program = &background_program; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + program = &bake_program; + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + program = &displace_program; + } + program->wait_for_availability(); + cl_kernel kernel = (*program)(); + + cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); + + set_kernel_arg_buffers(kernel, &start_arg_index); + + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); + } + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); + + for (int sample = 0; sample < task.num_samples; sample++) { + + if (task.get_cancel()) + break; + + kernel_set_args(kernel, start_arg_index, sample); + + enqueue_kernel(kernel, task.shader_w, 1); + + clFinish(cqCommandQueue); + + task.update_progress(NULL); + } } string OpenCLDevice::kernel_build_options(const string *debug_src) { - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if(platform_name == "NVIDIA CUDA") { - build_options += "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), &compute_capability_major, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), &compute_capability_minor, NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + - compute_capability_minor * 10); - } - - else if(platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if(platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if(platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. - */ - if(OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if(info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if(OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -#ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; -#endif - - return build_options; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if (platform_name == "NVIDIA CUDA") { + build_options += + "-D__KERNEL_OPENCL_NVIDIA__ " + "-cl-nv-maxrregcount=32 " + "-cl-nv-verbose "; + + uint compute_capability_major, compute_capability_minor; + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), + &compute_capability_major, + NULL); + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), + &compute_capability_minor, + NULL); + + build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", + compute_capability_major * 100 + compute_capability_minor * 10); + } + + else if (platform_name == "Apple") + build_options += "-D__KERNEL_OPENCL_APPLE__ "; + + else if (platform_name == "AMD Accelerated Parallel Processing") + build_options += "-D__KERNEL_OPENCL_AMD__ "; + + else if (platform_name == "Intel(R) OpenCL") { + build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; + + /* Options for gdb source level kernel debugging. + * this segfaults on linux currently. + */ + if (OpenCLInfo::use_debug() && debug_src) + build_options += "-g -s \"" + *debug_src + "\" "; + } + + if (info.has_half_images) { + build_options += "-D__KERNEL_CL_KHR_FP16__ "; + } + + if (OpenCLInfo::use_debug()) { + build_options += "-D__KERNEL_OPENCL_DEBUG__ "; + } + +# ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +# endif + + return build_options; } /* TODO(sergey): In the future we can use variadic templates, once @@ -1944,137 +1913,130 @@ string OpenCLDevice::kernel_build_options(const string *debug_src) */ int OpenCLDevice::kernel_set_args(cl_kernel kernel, int start_argument_index, - const ArgumentWrapper& arg1, - const ArgumentWrapper& arg2, - const ArgumentWrapper& arg3, - const ArgumentWrapper& arg4, - const ArgumentWrapper& arg5, - const ArgumentWrapper& arg6, - const ArgumentWrapper& arg7, - const ArgumentWrapper& arg8, - const ArgumentWrapper& arg9, - const ArgumentWrapper& arg10, - const ArgumentWrapper& arg11, - const ArgumentWrapper& arg12, - const ArgumentWrapper& arg13, - const ArgumentWrapper& arg14, - const ArgumentWrapper& arg15, - const ArgumentWrapper& arg16, - const ArgumentWrapper& arg17, - const ArgumentWrapper& arg18, - const ArgumentWrapper& arg19, - const ArgumentWrapper& arg20, - const ArgumentWrapper& arg21, - const ArgumentWrapper& arg22, - const ArgumentWrapper& arg23, - const ArgumentWrapper& arg24, - const ArgumentWrapper& arg25, - const ArgumentWrapper& arg26, - const ArgumentWrapper& arg27, - const ArgumentWrapper& arg28, - const ArgumentWrapper& arg29, - const ArgumentWrapper& arg30, - const ArgumentWrapper& arg31, - const ArgumentWrapper& arg32, - const ArgumentWrapper& arg33) + const ArgumentWrapper &arg1, + const ArgumentWrapper &arg2, + const ArgumentWrapper &arg3, + const ArgumentWrapper &arg4, + const ArgumentWrapper &arg5, + const ArgumentWrapper &arg6, + const ArgumentWrapper &arg7, + const ArgumentWrapper &arg8, + const ArgumentWrapper &arg9, + const ArgumentWrapper &arg10, + const ArgumentWrapper &arg11, + const ArgumentWrapper &arg12, + const ArgumentWrapper &arg13, + const ArgumentWrapper &arg14, + const ArgumentWrapper &arg15, + const ArgumentWrapper &arg16, + const ArgumentWrapper &arg17, + const ArgumentWrapper &arg18, + const ArgumentWrapper &arg19, + const ArgumentWrapper &arg20, + const ArgumentWrapper &arg21, + const ArgumentWrapper &arg22, + const ArgumentWrapper &arg23, + const ArgumentWrapper &arg24, + const ArgumentWrapper &arg25, + const ArgumentWrapper &arg26, + const ArgumentWrapper &arg27, + const ArgumentWrapper &arg28, + const ArgumentWrapper &arg29, + const ArgumentWrapper &arg30, + const ArgumentWrapper &arg31, + const ArgumentWrapper &arg32, + const ArgumentWrapper &arg33) { - int current_arg_index = 0; -#define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if(arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, \ - start_argument_index + current_arg_index, \ - arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while(false) - FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -#undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; + int current_arg_index = 0; +# define FAKE_VARARG_HANDLE_ARG(arg) \ + do { \ + if (arg.pointer != NULL) { \ + opencl_assert(clSetKernelArg( \ + kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ + ++current_arg_index; \ + } \ + else { \ + return current_arg_index; \ + } \ + } while (false) + FAKE_VARARG_HANDLE_ARG(arg1); + FAKE_VARARG_HANDLE_ARG(arg2); + FAKE_VARARG_HANDLE_ARG(arg3); + FAKE_VARARG_HANDLE_ARG(arg4); + FAKE_VARARG_HANDLE_ARG(arg5); + FAKE_VARARG_HANDLE_ARG(arg6); + FAKE_VARARG_HANDLE_ARG(arg7); + FAKE_VARARG_HANDLE_ARG(arg8); + FAKE_VARARG_HANDLE_ARG(arg9); + FAKE_VARARG_HANDLE_ARG(arg10); + FAKE_VARARG_HANDLE_ARG(arg11); + FAKE_VARARG_HANDLE_ARG(arg12); + FAKE_VARARG_HANDLE_ARG(arg13); + FAKE_VARARG_HANDLE_ARG(arg14); + FAKE_VARARG_HANDLE_ARG(arg15); + FAKE_VARARG_HANDLE_ARG(arg16); + FAKE_VARARG_HANDLE_ARG(arg17); + FAKE_VARARG_HANDLE_ARG(arg18); + FAKE_VARARG_HANDLE_ARG(arg19); + FAKE_VARARG_HANDLE_ARG(arg20); + FAKE_VARARG_HANDLE_ARG(arg21); + FAKE_VARARG_HANDLE_ARG(arg22); + FAKE_VARARG_HANDLE_ARG(arg23); + FAKE_VARARG_HANDLE_ARG(arg24); + FAKE_VARARG_HANDLE_ARG(arg25); + FAKE_VARARG_HANDLE_ARG(arg26); + FAKE_VARARG_HANDLE_ARG(arg27); + FAKE_VARARG_HANDLE_ARG(arg28); + FAKE_VARARG_HANDLE_ARG(arg29); + FAKE_VARARG_HANDLE_ARG(arg30); + FAKE_VARARG_HANDLE_ARG(arg31); + FAKE_VARARG_HANDLE_ARG(arg32); + FAKE_VARARG_HANDLE_ARG(arg33); +# undef FAKE_VARARG_HANDLE_ARG + return current_arg_index; } void OpenCLDevice::release_kernel_safe(cl_kernel kernel) { - if(kernel) { - clReleaseKernel(kernel); - } + if (kernel) { + clReleaseKernel(kernel); + } } void OpenCLDevice::release_mem_object_safe(cl_mem mem) { - if(mem != NULL) { - clReleaseMemObject(mem); - } + if (mem != NULL) { + clReleaseMemObject(mem); + } } void OpenCLDevice::release_program_safe(cl_program program) { - if(program) { - clReleaseProgram(program); - } + if (program) { + clReleaseProgram(program); + } } /* ** Those guys are for workign around some compiler-specific bugs ** */ -cl_program OpenCLDevice::load_cached_kernel(ustring key, - thread_scoped_lock& cache_locker) +cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) { - return OpenCLCache::get_program(cpPlatform, - cdDevice, - key, - cache_locker); + return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); } void OpenCLDevice::store_cached_kernel(cl_program program, ustring key, - thread_scoped_lock& cache_locker) + thread_scoped_lock &cache_locker) { - OpenCLCache::store_program(cpPlatform, - cdDevice, - program, - key, - cache_locker); + OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); } -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background) { - return new OpenCLDevice(info, stats, profiler, background); + return new OpenCLDevice(info, stats, profiler, background); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 5a1e12af8ab..cc40ad42b06 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -16,1059 +16,1017 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" -#include "device/device_intern.h" +# include "device/opencl/opencl.h" +# include "device/device_intern.h" -#include "util/util_debug.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" -#include "util/util_system.h" +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" +# include "util/util_system.h" using std::cerr; using std::endl; CCL_NAMESPACE_BEGIN -OpenCLCache::Slot::ProgramEntry::ProgramEntry() - : program(NULL), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) { } -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs) - : program(rhs.program), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) + : program(rhs.program), mutex(NULL) { } OpenCLCache::Slot::ProgramEntry::~ProgramEntry() { - delete mutex; + delete mutex; } -OpenCLCache::Slot::Slot() - : context_mutex(NULL), - context(NULL) +OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) { } -OpenCLCache::Slot::Slot(const Slot& rhs) - : context_mutex(NULL), - context(NULL), - programs(rhs.programs) +OpenCLCache::Slot::Slot(const Slot &rhs) + : context_mutex(NULL), context(NULL), programs(rhs.programs) { } OpenCLCache::Slot::~Slot() { - delete context_mutex; + delete context_mutex; } -OpenCLCache& OpenCLCache::global_instance() +OpenCLCache &OpenCLCache::global_instance() { - static OpenCLCache instance; - return instance; + static OpenCLCache instance; + return instance; } cl_context OpenCLCache::get_context(cl_platform_id platform, cl_device_id device, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - /* create slot lock only while holding cache lock */ - if(!slot.context_mutex) - slot.context_mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!slot.context_mutex) + slot.context_mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*slot.context_mutex); - /* If the thing isn't cached */ - if(slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (slot.context == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainContext(slot.context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return slot.context; + return slot.context; } cl_program OpenCLCache::get_program(cl_platform_id platform, cl_device_id device, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - pair<Slot::EntryMap::iterator,bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); + pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( + Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - Slot::ProgramEntry &entry = ins2.first->second; + Slot::ProgramEntry &entry = ins2.first->second; - /* create slot lock only while holding cache lock */ - if(!entry.mutex) - entry.mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!entry.mutex) + entry.mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*entry.mutex); - /* If the thing isn't cached */ - if(entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (entry.program == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainProgram(entry.program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return entry.program; + return entry.program; } void OpenCLCache::store_context(cl_platform_id platform, cl_device_id device, cl_context context, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(context != NULL); - OpenCLCache &self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); + thread_scoped_lock cache_lock(self.cache_lock); + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + cache_lock.unlock(); - Slot &slot = i->second; + Slot &slot = i->second; - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); + /* sanity check */ + assert(i != self.cache.end()); + assert(slot.context == NULL); - slot.context = context; + slot.context = context; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* increment reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* increment reference count in OpenCL. + * The caller is going to release the object when done with it. */ + cl_int ciErr = clRetainContext(context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } void OpenCLCache::store_program(cl_platform_id platform, cl_device_id device, cl_program program, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(program != NULL); - OpenCLCache &self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + assert(i != self.cache.end()); + Slot &slot = i->second; - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; + Slot::EntryMap::iterator i2 = slot.programs.find(key); + assert(i2 != slot.programs.end()); + Slot::ProgramEntry &entry = i2->second; - assert(entry.program == NULL); + assert(entry.program == NULL); - cache_lock.unlock(); + cache_lock.unlock(); - entry.program = program; + entry.program = program; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* Increment reference count in OpenCL. + * The caller is going to release the object when done with it. + */ + cl_int ciErr = clRetainProgram(program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } string OpenCLCache::get_kernel_md5() { - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); + OpenCLCache &self = global_instance(); + thread_scoped_lock lock(self.kernel_md5_lock); - if(self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; + if (self.kernel_md5.empty()) { + self.kernel_md5 = path_files_md5_hash(path_get("source")); + } + return self.kernel_md5; } -static string get_program_source(const string& kernel_file) +static string get_program_source(const string &kernel_file) { - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. - */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; + /* We compile kernels consisting of many files. unfortunately OpenCL + * kernel caches do not seem to recognize changes in included files. + * so we force recompile on changes by adding the md5 hash of all files. + */ + source = path_source_replace_includes(source, path_get("source")); + source += "\n// " + util_md5_string(source) + "\n"; + return source; } OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_file, - const string& kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) + const string &program_name, + const string &kernel_file, + const string &kernel_build_options, + bool use_stdout) + : device(device), + program_name(program_name), + kernel_file(kernel_file), + kernel_build_options(kernel_build_options), + use_stdout(use_stdout) { - loaded = false; - needs_compiling = true; - program = NULL; + loaded = false; + needs_compiling = true; + program = NULL; } OpenCLDevice::OpenCLProgram::~OpenCLProgram() { - release(); + release(); } void OpenCLDevice::OpenCLProgram::release() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - if(kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if(program) { - clReleaseProgram(program); - program = NULL; - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + if (kernel->second) { + clReleaseKernel(kernel->second); + kernel->second = NULL; + } + } + if (program) { + clReleaseProgram(program); + program = NULL; + } } -void OpenCLDevice::OpenCLProgram::add_log(const string& msg, bool debug) +void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) { - if(!use_stdout) { - log += msg + "\n"; - } - else if(!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } + if (!use_stdout) { + log += msg + "\n"; + } + else if (!debug) { + printf("%s\n", msg.c_str()); + fflush(stdout); + } + else { + VLOG(2) << msg; + } } -void OpenCLDevice::OpenCLProgram::add_error(const string& msg) +void OpenCLDevice::OpenCLProgram::add_error(const string &msg) { - if(use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if(error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; + if (use_stdout) { + fprintf(stderr, "%s\n", msg.c_str()); + } + if (error_msg == "") { + error_msg += "\n"; + } + error_msg += msg; } void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) { - if(!kernels.count(name)) { - kernels[name] = NULL; - } + if (!kernels.count(name)) { + kernels[name] = NULL; + } } bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) { - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; + string build_options; + build_options = device->kernel_build_options(debug_src) + kernel_build_options; - VLOG(1) << "Build options passed to clBuildProgram: '" - << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; + cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - /* show warnings even if build is successful */ - size_t ret_val_size = 0; + /* show warnings even if build is successful */ + size_t ret_val_size = 0; - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + ", errors in console."); - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + + ", errors in console."); + } - if(ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); + if (ret_val_size > 1) { + vector<char> build_log(ret_val_size + 1); + clGetProgramBuildInfo( + program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if(!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), ciErr == CL_SUCCESS); - } - } + build_log[ret_val_size] = '\0'; + /* Skip meaningless empty output from the NVidia compiler. */ + if (!(ret_val_size == 2 && build_log[0] == '\n')) { + add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), + ciErr == CL_SUCCESS); + } + } - return (ciErr == CL_SUCCESS); + return (ciErr == CL_SUCCESS); } bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = get_program_source(kernel_file); + string source = get_program_source(kernel_file); - if(debug_src) { - path_write_text(*debug_src, source); - } + if (debug_src) { + path_write_text(*debug_src, source); + } - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_int ciErr; - program = clCreateProgramWithSource(device->cxContext, - 1, - &source_str, - &source_len, - &ciErr); + program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); + return false; + } - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); + double starttime = time_dt(); + add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); - return true; + return true; } -static void escape_python_string(string& str) +static void escape_python_string(string &str) { - /* Escape string to be passed as a Python raw string with '' quotes'. */ - string_replace(str, "'", "\'"); + /* Escape string to be passed as a Python raw string with '' quotes'. */ + string_replace(str, "'", "\'"); } -bool OpenCLDevice::OpenCLProgram::compile_separate(const string& clbin) +bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) { - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back( - string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - if(!system_call_self(args) || !path_exists(clbin)) { - return false; - } - - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); - - return load_binary(clbin); + vector<string> args; + args.push_back("--background"); + args.push_back("--factory-startup"); + args.push_back("--python-expr"); + + int device_platform_id = device->device_num; + string device_name = device->device_name; + string platform_name = device->platform_name; + string build_options = device->kernel_build_options(NULL) + kernel_build_options; + string kernel_file_escaped = kernel_file; + string clbin_escaped = clbin; + + escape_python_string(device_name); + escape_python_string(platform_name); + escape_python_string(build_options); + escape_python_string(kernel_file_escaped); + escape_python_string(clbin_escaped); + + args.push_back(string_printf( + "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", + device_platform_id, + device_name.c_str(), + platform_name.c_str(), + build_options.c_str(), + kernel_file_escaped.c_str(), + clbin_escaped.c_str())); + + double starttime = time_dt(); + add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); + if (!system_call_self(args) || !path_exists(clbin)) { + return false; + } + + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); + + return load_binary(clbin); } /* Compile opencl kernel. This method is called from the _cycles Python * module compile kernels. Parameters must match function above. */ -bool device_opencl_compile_kernel(const vector<string>& parameters) +bool device_opencl_compile_kernel(const vector<string> ¶meters) { - int device_platform_id = std::stoi(parameters[0]); - const string& device_name = parameters[1]; - const string& platform_name = parameters[2]; - const string& build_options = parameters[3]; - const string& kernel_file = parameters[4]; - const string& binary_path = parameters[5]; - - if(clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice& platform_device = usable_devices[device_platform_id]; - if(platform_device.platform_name != platform_name || - platform_device.device_name != device_name) - { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties) platform, - 0, 0 - }; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if(err != CL_SUCCESS) { - return false; - } - - string source = get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if(err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if(err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - clReleaseContext(context); - - return result; + int device_platform_id = std::stoi(parameters[0]); + const string &device_name = parameters[1]; + const string &platform_name = parameters[2]; + const string &build_options = parameters[3]; + const string &kernel_file = parameters[4]; + const string &binary_path = parameters[5]; + + if (clewInit() != CLEW_SUCCESS) { + return false; + } + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (device_platform_id >= usable_devices.size()) { + return false; + } + + OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; + if (platform_device.platform_name != platform_name || + platform_device.device_name != device_name) { + return false; + } + + cl_platform_id platform = platform_device.platform_id; + cl_device_id device = platform_device.device_id; + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; + + cl_int err; + cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + return false; + } + + string source = get_program_source(kernel_file); + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); + bool result = false; + + if (err == CL_SUCCESS) { + err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + + if (err == CL_SUCCESS) { + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + if (size > 0) { + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); + result = path_write_binary(binary_path, binary); + } + } + clReleaseProgram(program); + } + + clReleaseContext(context); + + return result; } -bool OpenCLDevice::OpenCLProgram::load_binary(const string& clbin, - const string *debug_src) +bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) { - /* read binary into memory */ - vector<uint8_t> binary; + /* read binary into memory */ + vector<uint8_t> binary; - if(!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } + if (!path_read_binary(clbin, binary)) { + add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); + return false; + } - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; + /* create program */ + cl_int status, ciErr; + size_t size = binary.size(); + const uint8_t *bytes = &binary[0]; - program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice, - &size, &bytes, &status, &ciErr); + program = clCreateProgramWithBinary( + device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - if(status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " - + clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } + if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { + add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + + clewErrorString(status) + " " + clewErrorString(ciErr)); + return false; + } - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - return true; + return true; } -bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin) +bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(!size) - return false; + if (!size) + return false; - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - return path_write_binary(clbin, binary); + return path_write_binary(clbin, binary); } bool OpenCLDevice::OpenCLProgram::load() { - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if(path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; + loaded = false; + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. */ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + if (!program) { + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* If binary kernel exists already, try use it. */ + if (path_exists(clbin) && load_binary(clbin)) { + /* Kernel loaded from binary, nothing to do. */ + add_log(string("Loaded program from ") + clbin + ".", true); + + /* Cache the program. */ + device->store_cached_kernel(program, cache_key, cache_locker); + } + else { + add_log(string("OpenCL program ") + program_name + " not found on disk.", true); + cache_locker.unlock(); + } + } + + if (program) { + create_kernels(); + loaded = true; + needs_compiling = false; + } + + return loaded; } void OpenCLDevice::OpenCLProgram::compile() { - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - - if (!program) - { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if(OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - /* If binary kernel exists already, try use it. */ - if(compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true); - - /* If does not exist or loading binary failed, compile kernel. */ - if(!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if(!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; + assert(device); + + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. */ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + + if (!program) { + + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* path to preprocessed source for debugging */ + string clsrc, *debug_src = NULL; + + if (OpenCLInfo::use_debug()) { + clsrc = basename + ".cl"; + debug_src = &clsrc; + } + + /* If binary kernel exists already, try use it. */ + if (compile_separate(clbin)) { + add_log(string("Built and loaded program from ") + clbin + ".", true); + loaded = true; + } + else { + add_log(string("Separate-process building of ") + clbin + + " failed, will fall back to regular building.", + true); + + /* If does not exist or loading binary failed, compile kernel. */ + if (!compile_kernel(debug_src)) { + needs_compiling = false; + return; + } + + /* Save binary for reuse. */ + if (!save_binary(clbin)) { + add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); + } + } + + /* Cache the program. */ + device->store_cached_kernel(program, cache_key, cache_locker); + } + + create_kernels(); + needs_compiling = false; + loaded = true; } void OpenCLDevice::OpenCLProgram::create_kernels() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if(device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr)); - return; - } - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + assert(kernel->second == NULL); + cl_int ciErr; + string name = "kernel_ocl_" + kernel->first.string(); + kernel->second = clCreateKernel(program, name.c_str(), &ciErr); + if (device->opencl_error(ciErr)) { + add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + + clewErrorString(ciErr)); + return; + } + } } bool OpenCLDevice::OpenCLProgram::wait_for_availability() { - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; + add_log(string("Waiting for availability of ") + program_name + ".", true); + while (needs_compiling) { + time_sleep(0.1); + } + return loaded; } void OpenCLDevice::OpenCLProgram::report_error() { - /* If loaded is true, there was no error. */ - if(loaded) return; - /* if use_stdout is true, the error was already reported. */ - if(use_stdout) return; - - cerr << error_msg << endl; - if(!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } + /* If loaded is true, there was no error. */ + if (loaded) + return; + /* if use_stdout is true, the error was already reported. */ + if (use_stdout) + return; + + cerr << error_msg << endl; + if (!compile_output.empty()) { + cerr << "OpenCL kernel build output for " << program_name << ":" << endl; + cerr << compile_output << endl; + } } cl_kernel OpenCLDevice::OpenCLProgram::operator()() { - assert(kernels.size() == 1); - return kernels.begin()->second; + assert(kernels.size() == 1); + return kernels.begin()->second; } cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) { - assert(kernels.count(name)); - return kernels[name]; + assert(kernels.count(name)); + return kernels[name]; } cl_device_type OpenCLInfo::device_type() { - switch(DebugFlags().opencl.device_type) - { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } + switch (DebugFlags().opencl.device_type) { + case DebugFlags::OpenCL::DEVICE_NONE: + return 0; + case DebugFlags::OpenCL::DEVICE_ALL: + return CL_DEVICE_TYPE_ALL; + case DebugFlags::OpenCL::DEVICE_DEFAULT: + return CL_DEVICE_TYPE_DEFAULT; + case DebugFlags::OpenCL::DEVICE_CPU: + return CL_DEVICE_TYPE_CPU; + case DebugFlags::OpenCL::DEVICE_GPU: + return CL_DEVICE_TYPE_GPU; + case DebugFlags::OpenCL::DEVICE_ACCELERATOR: + return CL_DEVICE_TYPE_ACCELERATOR; + default: + return CL_DEVICE_TYPE_ALL; + } } bool OpenCLInfo::use_debug() { - return DebugFlags().opencl.debug; + return DebugFlags().opencl.debug; } -bool OpenCLInfo::device_supported(const string& platform_name, - const cl_device_id device_id) +bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type)) { - return false; - } - string device_name; - if(!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if(!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework - * (aka, it will not be on Intel framework). This isn't supported - * and needs an explicit blacklist. - */ - if(strstr(device_name.c_str(), "Iris")) { - return false; - } - if(platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) - { - if(driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = { - /* GCN 1 */ - "Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan", - NULL - }; - for(int i = 0; blacklist[i] != NULL; i++) { - if(device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if (!get_device_name(device_id, &device_name)) { + return false; + } + + int driver_major = 0; + int driver_minor = 0; + if (!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. + */ + if (strstr(device_name.c_str(), "Iris")) { + return false; + } + if (platform_name == "AMD Accelerated Parallel Processing" && + device_type == CL_DEVICE_TYPE_GPU) { + if (driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = {/* GCN 1 */ + "Tahiti", + "Pitcairn", + "Capeverde", + "Oland", + "Hainan", + NULL}; + for (int i = 0; blacklist[i] != NULL; i++) { + if (device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } + return true; + } + if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { + return false; + } + return false; } -bool OpenCLInfo::platform_version_check(cl_platform_id platform, - string *error) +bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, - CL_PLATFORM_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { + if (error != NULL) { + *error = string_printf( + "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -bool OpenCLInfo::device_version_check(cl_device_id device, - string *error) +bool OpenCLInfo::device_version_check(cl_device_id device, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetDeviceInfo(device, - CL_DEVICE_OPENCL_C_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { + if (error != NULL) { + *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) { - if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. */ - cl_char topology[24]; - if(clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if(platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if(clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. */ - return ""; + if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { + /* Use cl_amd_device_topology extension. */ + cl_char topology[24]; + if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && + topology[0] == 1) { + return string_printf("%02x:%02x.%01x", + (unsigned int)topology[21], + (unsigned int)topology[22], + (unsigned int)topology[23]); + } + } + else if (platform_name == "NVIDIA CUDA") { + /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ + cl_int bus_id, slot_id; + if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && + clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { + return string_printf("%02x:%02x.%01x", + (unsigned int)(bus_id), + (unsigned int)(slot_id >> 3), + (unsigned int)(slot_id & 0x7)); + } + } + /* No general way to get a hardware ID from OpenCL => give up. */ + return ""; } -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all) +void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all) { - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -#define FIRST_VLOG(severity) if(first_time) VLOG(severity) - - usable_devices->clear(); - - if(device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if(!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" - << string(clewErrorString(error)); - first_time = false; - return; - } - if(platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for(int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " - << platform_name << "."; - if(!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if(!get_platform_devices(platform_id, - device_type, - &device_ids, - &error)) - { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " - << string(clewErrorString(error)); - continue; - } - if(device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", it has no devices."; - continue; - } - for(int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - string device_name; - if(!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch device name: " - << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if(!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << " due to old compiler version."; - continue; - } - if(force_all || - device_supported(platform_name, device_id)) - { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" - << string(clewErrorString(error)); - continue; - } - string readable_device_name = - get_readable_device_name(device_id); - if(readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " - << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " - << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice( - platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", not officially supported yet."; - } - } - } - first_time = false; + const cl_device_type device_type = OpenCLInfo::device_type(); + static bool first_time = true; +# define FIRST_VLOG(severity) \ + if (first_time) \ + VLOG(severity) + + usable_devices->clear(); + + if (device_type == 0) { + FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; + first_time = false; + return; + } + + cl_int error; + vector<cl_device_id> device_ids; + vector<cl_platform_id> platform_ids; + + /* Get platforms. */ + if (!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); + first_time = false; + return; + } + if (platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; + first_time = false; + return; + } + /* Devices are numbered consecutively across platforms. */ + for (int platform = 0; platform < platform_ids.size(); platform++) { + cl_platform_id platform_id = platform_ids[platform]; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + FIRST_VLOG(2) << "Failed to get platform name, ignoring."; + continue; + } + FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; + if (!platform_version_check(platform_id)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << " due to too old compiler version."; + continue; + } + if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << ", failed to fetch of devices: " << string(clewErrorString(error)); + continue; + } + if (device_ids.size() == 0) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; + continue; + } + for (int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + if (!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error)) + << ", ignoring."; + continue; + } + if (!device_version_check(device_id)) { + FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; + continue; + } + if (force_all || device_supported(platform_name, device_id)) { + cl_device_type device_type; + if (!get_device_type(device_id, &device_type, &error)) { + FIRST_VLOG(2) << "Ignoring device " << device_name + << ", failed to fetch device type:" << string(clewErrorString(error)); + continue; + } + string readable_device_name = get_readable_device_name(device_id); + if (readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; + string hardware_id = get_hardware_id(platform_name, device_id); + string device_extensions = get_device_extensions(device_id); + usable_devices->push_back(OpenCLPlatformDevice(platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id, + device_extensions)); + } + else { + FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; + } + } + } + first_time = false; } -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error) +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) { - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. */ - cl_int err; - platform_ids->resize(num_platforms); - if((err = clGetPlatformIDs(num_platforms, - &platform_ids->at(0), - NULL)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. */ + cl_int err; + platform_ids->resize(num_platforms); + if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_platform_id> OpenCLInfo::get_platforms() { - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; } bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) { - cl_int err; - if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platforms() { - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; } -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, - string *platform_name) +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) { - char buffer[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(buffer), - &buffer, - NULL) != CL_SUCCESS) - { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; + char buffer[256]; + if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != + CL_SUCCESS) { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; } string OpenCLInfo::get_platform_name(cl_platform_id platform_id) { - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; } bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, @@ -1076,266 +1034,222 @@ bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_uint *num_devices, cl_int *error) { - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - num_devices)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices)) - { - return 0; - } - return num_devices; + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices)) { + return 0; + } + return num_devices; } bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type, vector<cl_device_id> *device_ids, - cl_int* error) + cl_int *error) { - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices, - error)) - { - return false; - } - /* Get actual device list. */ - device_ids->resize(num_devices); - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids->at(0), - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; } -bool OpenCLInfo::get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error) +bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + *device_name = buffer; + return true; } string OpenCLInfo::get_device_name(cl_device_id device_id) { - string device_name; - if(!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; + string device_name; + if (!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; } bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error) + string *device_extensions, + cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_EXTENSIONS, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_extensions = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + *device_extensions = buffer; + return true; } string OpenCLInfo::get_device_extensions(cl_device_id device_id) { - string device_extensions; - if(!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; + string device_extensions; + if (!get_device_extensions(device_id, &device_extensions)) { + return ""; + } + return device_extensions; } bool OpenCLInfo::get_device_type(cl_device_id device_id, cl_device_type *device_type, - cl_int* error) + cl_int *error) { - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - device_type, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceInfo( + device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_type = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; } string OpenCLInfo::get_readable_device_name(cl_device_id device_id) { - string name = ""; - char board_name[1024]; - size_t length = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_BOARD_NAME_AMD, - sizeof(board_name), - &board_name, - &length) == CL_SUCCESS) - { - if(length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if(name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. - */ - if(name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) - { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; + string name = ""; + char board_name[1024]; + size_t length = 0; + if (clGetDeviceInfo( + device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == + CL_SUCCESS) { + if (length != 0 && board_name[0] != '\0') { + name = board_name; + } + } + + /* Fallback to standard device name API. */ + if (name.empty()) { + name = get_device_name(device_id); + } + + /* Special exception for AMD Vega, need to be able to tell + * Vega 56 from 64 apart. + */ + if (name == "Radeon RX Vega") { + cl_int max_compute_units = 0; + if (clGetDeviceInfo(device_id, + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(max_compute_units), + &max_compute_units, + NULL) == CL_SUCCESS) { + name += " " + to_string(max_compute_units); + } + } + + /* Distinguish from our native CPU device. */ + if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { + name += " (OpenCL)"; + } + + return name; } -bool OpenCLInfo::get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error) +bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DRIVER_VERSION, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - if(sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + if (sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; } int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) { - int base_align_bits; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MEM_BASE_ADDR_ALIGN, - sizeof(int), - &base_align_bits, - NULL) == CL_SUCCESS) - { - return base_align_bits/8; - } - return 1; + int base_align_bits; + if (clGetDeviceInfo( + device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == + CL_SUCCESS) { + return base_align_bits / 8; + } + return 1; } CCL_NAMESPACE_END |