From 949ab753bb2e2d0f76921ed6d716f074ce863f21 Mon Sep 17 00:00:00 2001 From: Jeroen Bakker Date: Wed, 20 Feb 2019 14:41:56 +0100 Subject: Cycles OpenCL: Remove OpenCL MegaKernel Using OpenCL MegaKernel has been slow and therefore not usefull. This patch will remove the mega kernel from the OpenCL codebase and the OpenCLDeviceBase class. T61736: removal of mega kernel T61703: baking does not work with mega kernel Tags: #cycles Differential Revision: https://developer.blender.org/D4383 --- intern/cycles/blender/addon/ui.py | 1 - intern/cycles/blender/blender_python.cpp | 20 +- intern/cycles/device/CMakeLists.txt | 2 - intern/cycles/device/device_opencl.cpp | 18 +- intern/cycles/device/opencl/memory_manager.cpp | 6 +- intern/cycles/device/opencl/memory_manager.h | 10 +- intern/cycles/device/opencl/opencl.h | 78 +- intern/cycles/device/opencl/opencl_base.cpp | 1422 --------------- intern/cycles/device/opencl/opencl_mega.cpp | 186 -- intern/cycles/device/opencl/opencl_split.cpp | 1875 +++++++++++++++++--- intern/cycles/device/opencl/opencl_util.cpp | 75 +- intern/cycles/kernel/CMakeLists.txt | 2 +- intern/cycles/kernel/kernels/opencl/kernel.cl | 148 -- intern/cycles/kernel/kernels/opencl/kernel_base.cl | 88 + intern/cycles/util/util_debug.cpp | 24 +- intern/cycles/util/util_debug.h | 3 - 16 files changed, 1759 insertions(+), 2199 deletions(-) delete mode 100644 intern/cycles/device/opencl/opencl_base.cpp delete mode 100644 intern/cycles/device/opencl/opencl_mega.cpp delete mode 100644 intern/cycles/kernel/kernels/opencl/kernel.cl create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_base.cl (limited to 'intern') diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index e372843d763..5d1d9e764d0 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1634,7 +1634,6 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): col = layout.column() col.label('OpenCL Flags:') - col.prop(cscene, "debug_opencl_kernel_type", text="Kernel") col.prop(cscene, "debug_opencl_device_type", text="Device") col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program") col.prop(cscene, "debug_use_opencl_debug", text="Debug") diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 513941b1fcc..da369d0454a 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -67,7 +67,6 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); /* Backup some settings for comparison. */ DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; - DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type; /* Synchronize shared flags. */ flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type"); /* Synchronize CPU flags. */ @@ -81,18 +80,6 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); - /* Synchronize OpenCL kernel type. */ - switch(get_enum(cscene, "debug_opencl_kernel_type")) { - case 0: - flags.opencl.kernel_type = DebugFlags::OpenCL::KERNEL_DEFAULT; - break; - case 1: - flags.opencl.kernel_type = DebugFlags::OpenCL::KERNEL_MEGA; - break; - case 2: - flags.opencl.kernel_type = DebugFlags::OpenCL::KERNEL_SPLIT; - break; - } /* Synchronize OpenCL device type. */ switch(get_enum(cscene, "debug_opencl_device_type")) { case 0: @@ -118,8 +105,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024; flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program"); - return flags.opencl.device_type != opencl_device_type || - flags.opencl.kernel_type != opencl_kernel_type; + return flags.opencl.device_type != opencl_device_type; } /* Reset debug flags to default values. @@ -130,10 +116,8 @@ bool debug_flags_reset() DebugFlagsRef flags = DebugFlags(); /* Backup some settings for comparison. */ DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; - DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type; flags.reset(); - return flags.opencl.device_type != opencl_device_type || - flags.opencl.kernel_type != opencl_kernel_type; + return flags.opencl.device_type != opencl_device_type; } } /* namespace */ diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 75e78e038ea..d95cd02a85e 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -37,8 +37,6 @@ set(SRC_OPENCL opencl/opencl.h opencl/memory_manager.h - opencl/opencl_base.cpp - opencl/opencl_mega.cpp opencl/opencl_split.cpp opencl/opencl_util.cpp opencl/memory_manager.cpp diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 1e8c6b2dd0e..948fe407f63 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -29,19 +29,7 @@ CCL_NAMESPACE_BEGIN Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) { - vector usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - assert(info.num < usable_devices.size()); - const OpenCLPlatformDevice& platform_device = usable_devices[info.num]; - const string& platform_name = platform_device.platform_name; - const cl_device_type device_type = platform_device.device_type; - if(OpenCLInfo::kernel_use_split(platform_name, device_type)) { - VLOG(1) << "Using split kernel."; - return opencl_create_split_device(info, stats, profiler, background); - } else { - VLOG(1) << "Using mega kernel."; - return opencl_create_mega_device(info, stats, profiler, background); - } + return opencl_create_split_device(info, stats, profiler, background); } bool device_opencl_init() @@ -111,7 +99,6 @@ void device_opencl_info(vector& devices) foreach(OpenCLPlatformDevice& platform_device, usable_devices) { /* Compute unique ID for persistent user preferences. */ const string& platform_name = platform_device.platform_name; - const cl_device_type device_type = platform_device.device_type; const string& device_name = platform_device.device_name; string hardware_id = platform_device.hardware_id; if(hardware_id == "") { @@ -133,8 +120,7 @@ void device_opencl_info(vector& devices) /* We don't know if it's used for display, but assume it is. */ info.display_device = true; info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name); - info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name, - device_type); + info.use_split_kernel = true; info.has_volume_decoupled = false; info.id = id; diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp index 485a656cb83..9cb105982aa 100644 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -28,7 +28,7 @@ void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) allocations.push_back(&allocation); } -void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device) +void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) { bool need_realloc = false; @@ -142,7 +142,7 @@ void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device) clFinish(device->cqCommandQueue); } -void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *) +void MemoryManager::DeviceBuffer::free(OpenCLDevice *) { buffer->free(); } @@ -160,7 +160,7 @@ MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() return smallest; } -MemoryManager::MemoryManager(OpenCLDeviceBase *device) +MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) { foreach(DeviceBuffer& device_buffer, device_buffers) { diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h index b49bd32dab6..8fcc4440369 100644 --- a/intern/cycles/device/opencl/memory_manager.h +++ b/intern/cycles/device/opencl/memory_manager.h @@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN -class OpenCLDeviceBase; +class OpenCLDevice; class MemoryManager { public: @@ -73,12 +73,12 @@ private: void add_allocation(Allocation& allocation); - void update_device_memory(OpenCLDeviceBase *device); + void update_device_memory(OpenCLDevice *device); - void free(OpenCLDeviceBase *device); + void free(OpenCLDevice *device); }; - OpenCLDeviceBase *device; + OpenCLDevice *device; DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; @@ -90,7 +90,7 @@ private: DeviceBuffer* smallest_device_buffer(); public: - MemoryManager(OpenCLDeviceBase *device); + MemoryManager(OpenCLDevice *device); void free(); /* Free all memory. */ diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 766b9e4bf1a..1c5f6d375ec 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -18,6 +18,7 @@ #include "device/device.h" #include "device/device_denoising.h" +#include "device/device_split_kernel.h" #include "util/util_map.h" #include "util/util_param.h" @@ -84,8 +85,6 @@ public: static cl_device_type device_type(); static bool use_debug(); static bool kernel_use_advanced_shading(const string& platform_name); - static bool kernel_use_split(const string& platform_name, - const cl_device_type device_type); static bool device_supported(const string& platform_name, const cl_device_id device_id); static bool platform_version_check(cl_platform_id platform, @@ -259,7 +258,7 @@ public: } \ } (void) 0 -class OpenCLDeviceBase : public Device +class OpenCLDevice : public Device { public: DedicatedTaskPool task_pool; @@ -273,7 +272,7 @@ public: class OpenCLProgram { public: OpenCLProgram() : loaded(false), program(NULL), device(NULL) {} - OpenCLProgram(OpenCLDeviceBase *device, + OpenCLProgram(OpenCLDevice *device, const string& program_name, const string& kernel_name, const string& kernel_build_options, @@ -311,7 +310,7 @@ public: bool loaded; cl_program program; - OpenCLDeviceBase *device; + OpenCLDevice *device; /* Used for the OpenCLCache key. */ string program_name; @@ -325,6 +324,32 @@ public: map kernels; }; + DeviceSplitKernel *split_kernel; + + OpenCLProgram program_data_init; + OpenCLProgram program_state_buffer_size; + + OpenCLProgram program_split; + + OpenCLProgram program_path_init; + OpenCLProgram program_scene_intersect; + OpenCLProgram program_lamp_emission; + OpenCLProgram program_do_volume; + OpenCLProgram program_queue_enqueue; + OpenCLProgram program_indirect_background; + OpenCLProgram program_shader_setup; + OpenCLProgram program_shader_sort; + OpenCLProgram program_shader_eval; + OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; + OpenCLProgram program_subsurface_scatter; + OpenCLProgram program_direct_lighting; + OpenCLProgram program_shadow_blocked_ao; + OpenCLProgram program_shadow_blocked_dl; + OpenCLProgram program_enqueue_inactive; + OpenCLProgram program_next_iteration_setup; + OpenCLProgram program_indirect_subsurface; + OpenCLProgram program_buffer_update; + OpenCLProgram base_program; OpenCLProgram bake_program; OpenCLProgram displace_program; @@ -346,8 +371,8 @@ public: void opencl_error(const string& message); void opencl_assert_err(cl_int err, const char* where); - OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_); - ~OpenCLDeviceBase(); + OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_); + ~OpenCLDevice(); static void CL_CALLBACK context_notify_callback(const char *err_info, const void * /*private_info*/, size_t /*cb*/, void *user_data); @@ -355,17 +380,14 @@ public: bool opencl_version_check(); string device_md5_hash(string kernel_custom_build_options = ""); - virtual bool load_kernels(const DeviceRequestedFeatures& requested_features); - - /* Has to be implemented by the real device classes. - * The base device will then load all these programs. */ - virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features, - vector &programs) = 0; + bool load_kernels(const DeviceRequestedFeatures& requested_features); /* Get the name of the opencl program for the given kernel */ - virtual const string get_opencl_program_name(bool single_program, const string& kernel_name) = 0; + const string get_opencl_program_name(bool single_program, const string& kernel_name); /* Get the program file name to compile (*.cl) for the given kernel */ - virtual const string get_opencl_program_filename(bool single_program, const string& kernel_name) = 0; + const string get_opencl_program_filename(bool single_program, const string& kernel_name); + string get_build_options(const DeviceRequestedFeatures& requested_features); + string get_build_options_for_bake(const DeviceRequestedFeatures& requested_features); void mem_alloc(device_memory& mem); void mem_copy_to(device_memory& mem); @@ -393,10 +415,10 @@ public: class OpenCLDeviceTask : public DeviceTask { public: - OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) + OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task) : DeviceTask(task) { - run = function_bind(&OpenCLDeviceBase::thread_run, + run = function_bind(&OpenCLDevice::thread_run, device, this); } @@ -422,9 +444,16 @@ public: task_pool.cancel(); } - virtual void thread_run(DeviceTask * /*task*/) = 0; + void thread_run(DeviceTask *task); + + virtual BVHLayoutMask get_bvh_layout_mask() const { + return BVH_LAYOUT_BVH2; + } + + virtual bool show_samples() const { + return true; + } - virtual bool is_split_kernel() = 0; protected: string kernel_build_options(const string *debug_src = NULL); @@ -566,18 +595,15 @@ protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ - virtual cl_program load_cached_kernel( + cl_program load_cached_kernel( ustring key, thread_scoped_lock& cache_locker); - virtual void store_cached_kernel( + void store_cached_kernel( cl_program program, ustring key, thread_scoped_lock& cache_locker); - virtual string build_options_for_bake_program( - const DeviceRequestedFeatures& /*requested_features*/); - private: MemoryManager memory_manager; friend class MemoryManager; @@ -592,9 +618,11 @@ private: protected: void flush_texture_buffers(); + + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; }; -Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background); Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background); CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp deleted file mode 100644 index 6a47a60e915..00000000000 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ /dev/null @@ -1,1422 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -#include "device/opencl/opencl.h" - -#include "kernel/kernel_types.h" - -#include "util/util_algorithm.h" -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -struct texture_slot_t { - texture_slot_t(const string& name, int slot) - : name(name), - slot(slot) { - } - string name; - int slot; -}; - -bool OpenCLDeviceBase::opencl_error(cl_int err) -{ - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; -} - -void OpenCLDeviceBase::opencl_error(const string& message) -{ - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -} - -void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where) -{ - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -#ifndef NDEBUG - abort(); -#endif - } -} - -OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) -: Device(info, stats, profiler, background_), - memory_manager(this), - texture_info(this, "__texture_info", MEM_TEXTURE) -{ - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - null_mem = 0; - device_initialized = false; - textures_need_update = true; - - vector usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice& platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " - << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if(cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, - 0, 0 - }; - - /* create context */ - cxContext = clCreateContext(context_props, 1, &cdDevice, - context_notify_callback, cdDevice, &ciErr); - - if(opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating memory buffer for NULL"); - return; - } - - /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; -} - -OpenCLDeviceBase::~OpenCLDeviceBase() -{ - task_pool.stop(); - - memory_manager.free(); - - if(null_mem) - clReleaseMemObject(CL_MEM_PTR(null_mem)); - - ConstMemMap::iterator mt; - for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } - - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - if(cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if(cxContext) - clReleaseContext(cxContext); -} - -void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data) -{ - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); -} - -bool OpenCLDeviceBase::opencl_version_check() -{ - string error; - if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if(!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; -} - -string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options) -{ - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; - - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - - md5.append((uint8_t*)vendor, strlen(vendor)); - md5.append((uint8_t*)version, strlen(version)); - md5.append((uint8_t*)name, strlen(name)); - md5.append((uint8_t*)driver, strlen(driver)); - - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t*)options.c_str(), options.size()); - - return md5.get_hex(); -} - -bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features) -{ - VLOG(2) << "Loading kernels for platform " << platform_name - << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if(!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if(!opencl_version_check()) - return false; - - base_program = OpenCLProgram(this, "base", "kernel.cl", ""); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - - bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", build_options_for_bake_program(requested_features)); - bake_program.add_kernel(ustring("bake")); - - displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", build_options_for_bake_program(requested_features)); - displace_program.add_kernel(ustring("displace")); - - background_program = OpenCLProgram(this, "background", "kernel_background.cl", build_options_for_bake_program(requested_features)); - background_program.add_kernel(ustring("background")); - - denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - - vector programs; - programs.push_back(&bake_program); - programs.push_back(&displace_program); - programs.push_back(&background_program); - /* Call actual class to fill the vector with its programs. */ - if(!add_kernel_programs(requested_features, programs)) { - return false; - } - programs.push_back(&base_program); - programs.push_back(&denoising_program); - - /* Parallel compilation of Cycles kernels, this launches multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - TaskPool task_pool; - foreach(OpenCLProgram *program, programs) { - task_pool.push(function_bind(&OpenCLProgram::load, program)); - } - task_pool.wait_work(); - - foreach(OpenCLProgram *program, programs) { - VLOG(2) << program->get_log(); - if(!program->is_loaded()) { - program->report_error(); - return false; - } - } - - return true; -} - -void OpenCLDeviceBase::mem_alloc(device_memory& mem) -{ - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, - cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if(size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if(mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. - */ - if(size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, - mem_flag, - size, - mem_ptr, - &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = null_mem; - } - - stats.mem_alloc(size); - mem.device_size = size; -} - -void OpenCLDeviceBase::mem_copy_to(device_memory& mem) -{ - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if(size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, NULL)); - } - } -} - -void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) -{ - size_t offset = elem*y*w; - size_t size = elem*w*h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar*)mem.host_pointer + offset, - 0, - NULL, NULL)); -} - -void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) -{ - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while(d_offset < size) { - d_size = std::min(num_threads*sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } -} - -void OpenCLDeviceBase::mem_zero(device_memory& mem) -{ - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - if(base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(!base_program.is_loaded()) { - void* zero = mem.host_pointer; - - if(!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, NULL)); - - if(!mem.host_pointer) { - util_aligned_free(zero); - } - } - } -} - -void OpenCLDeviceBase::mem_free(device_memory& mem) -{ - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - if(mem.device_pointer) { - if(mem.device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } -} - -int OpenCLDeviceBase::mem_sub_ptr_alignment() -{ - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); -} - -device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) -{ - cl_mem_flags mem_flag; - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), - mem_flag, - CL_BUFFER_CREATE_TYPE_REGION, - &info, - &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; -} - -void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) -{ - if(device_pointer && device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } -} - -void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) -{ - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector *data; - - if(i == const_mem_map.end()) { - data = new device_vector(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); -} - -void OpenCLDeviceBase::tex_alloc(device_memory& mem) -{ - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDeviceBase::tex_free(device_memory& mem) -{ - if(mem.device_pointer) { - mem.device_pointer = 0; - - if(memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach(TexturesMap::value_type& value, textures) { - if(value.second == &mem) { - textures.erase(value.first); - break; - } - } - } -} - -size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) -{ - int r = global_size % group_size; - return global_size + ((r == 0)? 0: group_size - r); -} - -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) -{ - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo(kernel, cdDevice, - CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); - - if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if(x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if(local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size/max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. - */ - if(h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); -} - -void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) -{ - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - /* work around NULL not working, even though the spec says otherwise */ - ptr = CL_MEM_PTR(null_mem); - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); -} - -void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - flush_texture_buffers(); - - memory_manager.set_kernel_arg_buffers(kernel, narg); -} - -void OpenCLDeviceBase::flush_texture_buffers() -{ - if(!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector texture_slots; - -#define KERNEL_TEX(type, name) \ - if(textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -#include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach(TexturesMap::value_type& tex, textures) { - string name = tex.first; - - if(string_startswith(name, "__tex_image")) { - int pos = name.rfind("_"); - int id = atoi(name.data() + pos + 1); - texture_slots.push_back(texture_slot_t(name, - num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. */ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach(texture_slot_t& slot, texture_slots) { - TextureInfo& info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - - if(string_startswith(slot.name, "__tex_image")) { - device_memory *mem = textures[slot.name]; - - info.width = mem->data_width; - info.height = mem->data_height; - info.depth = mem->data_depth; - - info.interpolation = mem->interpolation; - info.extension = mem->extension; - } - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} - -void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f/(task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - - cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = - kernel_set_args(ckFilmConvertKernel, - 0, - d_data, - d_rgba, - d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); -} - -bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); - mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, channel_offset, - 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMUpdateOutput, 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, h, stride, - pass_stride, - channel_offset, - r, f); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; -} - -bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) -{ - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time? 1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, - arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, - task->storage.w, - task->storage.h, - 256); - - return true; -} - -bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = task->buffer.use_time? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, task->nlm_k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMConstructGramian, 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, h, stride, - pass_stride, - r, 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - - return true; -} - -bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr, - DenoisingTask *task) -{ - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - kernel_set_args(ckFinalize, 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; -} - -bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, int4 rect, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, - mean_mem, - variance_mem, - a_mem, - b_mem, - rect, - r); - enqueue_kernel(ckFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; -} - -bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; -} - -bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterGetFeature, arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; -} - -bool OpenCLDeviceBase::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - return true; -} - -bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; -} - -void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) -{ - denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); -} - -void OpenCLDeviceBase::shader(DeviceTask& task) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - cl_kernel kernel; - - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - kernel = bake_program(ustring("bake")); - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - kernel = displace_program(ustring("displace")); - } - else { - kernel = background_program(ustring("background")); - } - - cl_uint start_arg_index = - kernel_set_args(kernel, - 0, - d_data, - d_input, - d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_eval_type); - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_x, - d_shader_w, - d_offset); - - for(int sample = 0; sample < task.num_samples; sample++) { - - if(task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } -} - -string OpenCLDeviceBase::kernel_build_options(const string *debug_src) -{ - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if(platform_name == "NVIDIA CUDA") { - build_options += "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), &compute_capability_major, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), &compute_capability_minor, NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + - compute_capability_minor * 10); - } - - else if(platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if(platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if(platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. - */ - if(OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if(info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if(OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -#ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; -#endif - - return build_options; -} - -/* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ -int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper& arg1, - const ArgumentWrapper& arg2, - const ArgumentWrapper& arg3, - const ArgumentWrapper& arg4, - const ArgumentWrapper& arg5, - const ArgumentWrapper& arg6, - const ArgumentWrapper& arg7, - const ArgumentWrapper& arg8, - const ArgumentWrapper& arg9, - const ArgumentWrapper& arg10, - const ArgumentWrapper& arg11, - const ArgumentWrapper& arg12, - const ArgumentWrapper& arg13, - const ArgumentWrapper& arg14, - const ArgumentWrapper& arg15, - const ArgumentWrapper& arg16, - const ArgumentWrapper& arg17, - const ArgumentWrapper& arg18, - const ArgumentWrapper& arg19, - const ArgumentWrapper& arg20, - const ArgumentWrapper& arg21, - const ArgumentWrapper& arg22, - const ArgumentWrapper& arg23, - const ArgumentWrapper& arg24, - const ArgumentWrapper& arg25, - const ArgumentWrapper& arg26, - const ArgumentWrapper& arg27, - const ArgumentWrapper& arg28, - const ArgumentWrapper& arg29, - const ArgumentWrapper& arg30, - const ArgumentWrapper& arg31, - const ArgumentWrapper& arg32, - const ArgumentWrapper& arg33) -{ - int current_arg_index = 0; -#define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if(arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, \ - start_argument_index + current_arg_index, \ - arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while(false) - FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -#undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; -} - -void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel) -{ - if(kernel) { - clReleaseKernel(kernel); - } -} - -void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem) -{ - if(mem != NULL) { - clReleaseMemObject(mem); - } -} - -void OpenCLDeviceBase::release_program_safe(cl_program program) -{ - if(program) { - clReleaseProgram(program); - } -} - -/* ** Those guys are for workign around some compiler-specific bugs ** */ - -cl_program OpenCLDeviceBase::load_cached_kernel( - ustring key, - thread_scoped_lock& cache_locker) -{ - return OpenCLCache::get_program(cpPlatform, - cdDevice, - key, - cache_locker); -} - -void OpenCLDeviceBase::store_cached_kernel( - cl_program program, - ustring key, - thread_scoped_lock& cache_locker) -{ - OpenCLCache::store_program(cpPlatform, - cdDevice, - program, - key, - cache_locker); -} - -string OpenCLDeviceBase::build_options_for_bake_program( - const DeviceRequestedFeatures& requested_features) -{ - /* TODO(sergey): By default we compile all features, meaning - * mega kernel is not getting feature-based optimizations. - * - * Ideally we need always compile kernel with as less features - * enabled as possible to keep performance at it's max. - */ - - /* For now disable baking when not in use as this has major - * impact on kernel build times. - */ - if(!requested_features.use_baking) { - return "-D__NO_BAKING__"; - } - - return ""; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp deleted file mode 100644 index c0b9e81d4d3..00000000000 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -#include "device/opencl/opencl.h" - -#include "render/buffers.h" - -#include "kernel/kernel_types.h" - -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -class OpenCLDeviceMegaKernel : public OpenCLDeviceBase -{ -public: - OpenCLProgram path_trace_program; - - OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : OpenCLDeviceBase(info, stats, profiler, background_), - path_trace_program(this, - get_opencl_program_name(false, "megakernel"), - get_opencl_program_filename(false, "megakernel"), - "-D__COMPILE_ONLY_MEGAKERNEL__ ") - { - } - - - virtual bool show_samples() const - { - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - const string get_opencl_program_name(bool /*single_program*/, const string& kernel_name) - { - return kernel_name; - } - - const string get_opencl_program_filename(bool /*single_program*/, const string& /*kernel_name*/) - { - return "kernel.cl"; - } - - virtual bool add_kernel_programs(const DeviceRequestedFeatures& /*requested_features*/, - vector &programs) - { - path_trace_program.add_kernel(ustring("path_trace")); - programs.push_back(&path_trace_program); - return true; - } - - ~OpenCLDeviceMegaKernel() - { - task_pool.stop(); - path_trace_program.release(); - } - - void path_trace(RenderTile& rtile, int sample) - { - scoped_timer timer(&rtile.buffers->render_time); - - /* Cast arguments to cl types. */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* Sample arguments. */ - cl_int d_sample = sample; - - cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace")); - - cl_uint start_arg_index = - kernel_set_args(ckPathTraceKernel, - 0, - d_data, - d_buffer); - - set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckPathTraceKernel, - start_arg_index, - d_sample, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckPathTraceKernel, d_w, d_h); - } - - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, *task); - - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - - path_trace(tile, sample); - - tile.sample = sample + 1; - - task->update_progress(&tile, tile.w*tile.h); - } - - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - } - } - - bool is_split_kernel() - { - return false; - } -}; - -Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) -{ - return new OpenCLDeviceMegaKernel(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index c9d3eb2eb8c..853b2addb20 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -18,15 +18,12 @@ #include "device/opencl/opencl.h" -#include "render/buffers.h" - #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data_types.h" -#include "device/device_split_kernel.h" - #include "util/util_algorithm.h" #include "util/util_debug.h" +#include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_path.h" @@ -34,308 +31,96 @@ CCL_NAMESPACE_BEGIN -class OpenCLSplitKernel; - -namespace { - -/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. - */ -typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -#define KERNEL_TEX(type, name) \ - TextureInfo name; -# include "kernel/kernel_textures.h" -#undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; -} KernelGlobalsDummy; - -} // namespace - -static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) -{ - string build_options = "-D__SPLIT_KERNEL__ "; - build_options += requested_features.get_build_options(); - - /* Set compute device build option. */ - cl_device_type device_type; - OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); - assert(device->ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += " -D__COMPUTE_DEVICE_GPU__"; +struct texture_slot_t { + texture_slot_t(const string& name, int slot) + : name(name), + slot(slot) { } + string name; + int slot; +}; - return build_options; -} - -/* OpenCLDeviceSplitKernel's declaration/definition. */ -class OpenCLDeviceSplitKernel : public OpenCLDeviceBase +static const string fast_compiled_kernels = + "path_init " + "scene_intersect " + "queue_enqueue " + "shader_setup " + "shader_sort " + "enqueue_inactive " + "next_iteration_setup " + "indirect_subsurface " + "buffer_update"; + +const string OpenCLDevice::get_opencl_program_name(bool single_program, const string& kernel_name) { -public: - DeviceSplitKernel *split_kernel; - OpenCLProgram program_data_init; - OpenCLProgram program_state_buffer_size; - - OpenCLProgram program_split; - - OpenCLProgram program_path_init; - OpenCLProgram program_scene_intersect; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_queue_enqueue; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_setup; - OpenCLProgram program_shader_sort; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - OpenCLProgram program_enqueue_inactive; - OpenCLProgram program_next_iteration_setup; - OpenCLProgram program_indirect_subsurface; - OpenCLProgram program_buffer_update; - - OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_); - - ~OpenCLDeviceSplitKernel() - { - task_pool.stop(); - - /* Release kernels */ - program_data_init.release(); - - delete split_kernel; + if (single_program) { + return "split"; } - - virtual bool show_samples() const { - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - virtual bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - if (!OpenCLDeviceBase::load_kernels(requested_features)) { - return false; - } - return split_kernel->load_kernels(requested_features); - } - - const string fast_compiled_kernels = - "path_init " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update"; - - const string get_opencl_program_name(bool single_program, const string& kernel_name) - { - if (single_program) { - return "split"; + else { + if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { + return "split_bundle"; } else { - if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } + return "split_" + kernel_name; } } +} - const string get_opencl_program_filename(bool single_program, const string& kernel_name) - { - if (single_program) { - return "kernel_split.cl"; +const string OpenCLDevice::get_opencl_program_filename(bool single_program, const string& kernel_name) +{ + if (single_program) { + return "kernel_split.cl"; + } + else { + if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { + return "kernel_split_bundle.cl"; } else { - if (fast_compiled_kernels.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } + return "kernel_" + kernel_name + ".cl"; } } +} - virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features, - vector &programs) - { - bool single_program = OpenCLInfo::use_single_program(); - program_data_init = OpenCLDeviceBase::OpenCLProgram( - this, - get_opencl_program_name(single_program, "data_init"), - get_opencl_program_filename(single_program, "data_init"), - get_build_options(this, requested_features)); - program_data_init.add_kernel(ustring("path_trace_data_init")); - programs.push_back(&program_data_init); - - program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram( - this, - get_opencl_program_name(single_program, "state_buffer_size"), - get_opencl_program_filename(single_program, "state_buffer_size"), - get_build_options(this, requested_features)); - - program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); - programs.push_back(&program_state_buffer_size); - - -#define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); -#define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \ - program_##kernel_name = \ - OpenCLDeviceBase::OpenCLProgram(this, \ - "split_"#kernel_name, \ - "kernel_"#kernel_name".cl", \ - get_build_options(this, requested_features)); \ - program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ - programs.push_back(&program_##kernel_name); - - if (single_program) { - program_split = OpenCLDeviceBase::OpenCLProgram( - this, - "split" , - "kernel_split.cl", - get_build_options(this, requested_features)); - - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); - - programs.push_back(&program_split); - } - else { - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background); - ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. */ - program_split = OpenCLDeviceBase::OpenCLProgram( - this, - "split_bundle" , - "kernel_split_bundle.cl", - get_build_options(this, requested_features)); - - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); - programs.push_back(&program_split); - } -#undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM -#undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM +string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features) +{ + string build_options = "-D__SPLIT_KERNEL__ "; + build_options += requested_features.get_build_options(); - return true; + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); + assert(this->ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + build_options += " -D__COMPUTE_DEVICE_GPU__"; } - void thread_run(DeviceTask *task) - { - flush_texture_buffers(); + return build_options; +} - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, *task); - - /* Allocate buffer for kernel globals */ - device_only_memory kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } +string OpenCLDevice::get_build_options_for_bake(const DeviceRequestedFeatures& requested_features) +{ + return requested_features.get_build_options(); +} - kgbuffer.free(); - } - } +namespace { - bool is_split_kernel() - { - return true; - } +/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. + */ +typedef struct KernelGlobalsDummy { + ccl_constant KernelData *data; + ccl_global char *buffers[8]; -protected: - /* ** Those guys are for workign around some compiler-specific bugs ** */ +#define KERNEL_TEX(type, name) \ + TextureInfo name; +# include "kernel/kernel_textures.h" +#undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; +} KernelGlobalsDummy; - string build_options_for_bake_program( - const DeviceRequestedFeatures& requested_features) - { - return requested_features.get_build_options(); - } +} // namespace - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; -}; struct CachedSplitMemory { int id; @@ -349,12 +134,12 @@ struct CachedSplitMemory { class OpenCLSplitKernelFunction : public SplitKernelFunction { public: - OpenCLDeviceSplitKernel* device; - OpenCLDeviceBase::OpenCLProgram program; + OpenCLDevice* device; + OpenCLDevice::OpenCLProgram program; CachedSplitMemory& cached_memory; int cached_id; - OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) : device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) { } @@ -412,10 +197,10 @@ public: }; class OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDeviceSplitKernel *device; + OpenCLDevice *device; CachedSplitMemory cached_memory; public: - explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { + explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) { } virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, @@ -425,10 +210,10 @@ public: bool single_program = OpenCLInfo::use_single_program(); kernel->program = - OpenCLDeviceBase::OpenCLProgram(device, + OpenCLDevice::OpenCLProgram(device, device->get_opencl_program_name(single_program, kernel_name), device->get_opencl_program_filename(single_program, kernel_name), - get_build_options(device, requested_features)); + device->get_build_options(requested_features)); kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); kernel->program.load(); @@ -593,19 +378,1519 @@ public: } }; -OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) -: OpenCLDeviceBase(info, stats, profiler, background_) +bool OpenCLDevice::opencl_error(cl_int err) +{ + if(err != CL_SUCCESS) { + string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); + if(error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + return true; + } + + return false; +} + +void OpenCLDevice::opencl_error(const string& message) +{ + if(error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); +} + +void OpenCLDevice::opencl_assert_err(cl_int err, const char* where) +{ + if(err != CL_SUCCESS) { + string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); + if(error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); +#ifndef NDEBUG + abort(); +#endif + } +} + +OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +: Device(info, stats, profiler, background), + memory_manager(this), + texture_info(this, "__texture_info", MEM_TEXTURE) { + cpPlatform = NULL; + cdDevice = NULL; + cxContext = NULL; + cqCommandQueue = NULL; + null_mem = 0; + device_initialized = false; + textures_need_update = true; + + vector usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if(usable_devices.size() == 0) { + opencl_error("OpenCL: no devices found."); + return; + } + assert(info.num < usable_devices.size()); + OpenCLPlatformDevice& platform_device = usable_devices[info.num]; + device_num = info.num; + cpPlatform = platform_device.platform_id; + cdDevice = platform_device.device_id; + platform_name = platform_device.platform_name; + device_name = platform_device.device_name; + VLOG(2) << "Creating new Cycles device for OpenCL platform " + << platform_name << ", device " + << device_name << "."; + + { + /* try to use cached context */ + thread_scoped_lock cache_locker; + cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); + + if(cxContext == NULL) { + /* create context properties array to specify platform */ + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, + 0, 0 + }; + + /* create context */ + cxContext = clCreateContext(context_props, 1, &cdDevice, + context_notify_callback, cdDevice, &ciErr); + + if(opencl_error(ciErr)) { + opencl_error("OpenCL: clCreateContext failed"); + return; + } + + /* cache it */ + OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); + } + } + + cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); + return; + } + + null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); + return; + } + + /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + memory_manager.alloc("texture_info", texture_info); + + device_initialized = true; + split_kernel = new OpenCLSplitKernel(this); + background = background; +} + +OpenCLDevice::~OpenCLDevice() +{ + task_pool.stop(); + + memory_manager.free(); + + if(null_mem) + clReleaseMemObject(CL_MEM_PTR(null_mem)); + + ConstMemMap::iterator mt; + for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { + delete mt->second; + } + + base_program.release(); + bake_program.release(); + displace_program.release(); + background_program.release(); + + program_data_init.release(); + + if(cqCommandQueue) + clReleaseCommandQueue(cqCommandQueue); + if(cxContext) + clReleaseContext(cxContext); + + delete split_kernel; +} + +void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, + const void * /*private_info*/, size_t /*cb*/, void *user_data) +{ + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); +} + +bool OpenCLDevice::opencl_version_check() +{ + string error; + if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { + opencl_error(error); + return false; + } + if(!OpenCLInfo::device_version_check(cdDevice, &error)) { + opencl_error(error); + return false; + } + return true; +} + +string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) +{ + MD5Hash md5; + char version[256], driver[256], name[256], vendor[256]; + + clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); + clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); + + md5.append((uint8_t*)vendor, strlen(vendor)); + md5.append((uint8_t*)version, strlen(version)); + md5.append((uint8_t*)name, strlen(name)); + md5.append((uint8_t*)driver, strlen(driver)); + + string options = kernel_build_options(); + options += kernel_custom_build_options; + md5.append((uint8_t*)options.c_str(), options.size()); + + return md5.get_hex(); +} + +bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features) +{ + VLOG(2) << "Loading kernels for platform " << platform_name + << ", device " << device_name << "."; + /* Verify if device was initialized. */ + if(!device_initialized) { + fprintf(stderr, "OpenCL: failed to initialize device.\n"); + return false; + } + + /* Verify we have right opencl version. */ + if(!opencl_version_check()) + return false; + + base_program = OpenCLProgram(this, "base", "kernel_base.cl", ""); + base_program.add_kernel(ustring("convert_to_byte")); + base_program.add_kernel(ustring("convert_to_half_float")); + base_program.add_kernel(ustring("zero_buffer")); + + bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options_for_bake(requested_features)); + bake_program.add_kernel(ustring("bake")); + + displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options_for_bake(requested_features)); + displace_program.add_kernel(ustring("displace")); + + background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options_for_bake(requested_features)); + background_program.add_kernel(ustring("background")); + + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + + vector programs; + programs.push_back(&bake_program); + programs.push_back(&displace_program); + programs.push_back(&background_program); + + bool single_program = OpenCLInfo::use_single_program(); + program_data_init = OpenCLDevice::OpenCLProgram( + this, + get_opencl_program_name(single_program, "data_init"), + get_opencl_program_filename(single_program, "data_init"), + get_build_options(requested_features)); + program_data_init.add_kernel(ustring("path_trace_data_init")); + programs.push_back(&program_data_init); + + program_state_buffer_size = OpenCLDevice::OpenCLProgram( + this, + get_opencl_program_name(single_program, "state_buffer_size"), + get_opencl_program_filename(single_program, "state_buffer_size"), + get_build_options(requested_features)); + + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + + +#define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); +#define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \ + program_##kernel_name = \ + OpenCLDevice::OpenCLProgram(this, \ + "split_"#kernel_name, \ + "kernel_"#kernel_name".cl", \ + get_build_options(requested_features)); \ + program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ + programs.push_back(&program_##kernel_name); + + if (single_program) { + program_split = OpenCLDevice::OpenCLProgram( + this, + "split" , + "kernel_split.cl", + get_build_options(requested_features)); + + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); + + programs.push_back(&program_split); + } + else { + /* Ordered with most complex kernels first, to reduce overall compile time. */ + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background); + ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval); + + /* Quick kernels bundled in a single program to reduce overhead of starting + * Blender processes. */ + program_split = OpenCLDevice::OpenCLProgram( + this, + "split_bundle" , + "kernel_split_bundle.cl", + get_build_options(requested_features)); + + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface); + ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update); + programs.push_back(&program_split); + } +#undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM +#undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM + + programs.push_back(&base_program); + programs.push_back(&denoising_program); + + /* Parallel compilation of Cycles kernels, this launches multiple + * processes to workaround OpenCL frameworks serializing the calls + * internally within a single process. */ + TaskPool task_pool; + foreach(OpenCLProgram *program, programs) { + task_pool.push(function_bind(&OpenCLProgram::load, program)); + } + task_pool.wait_work(); + + foreach(OpenCLProgram *program, programs) { + VLOG(2) << program->get_log(); + if(!program->is_loaded()) { + program->report_error(); + return false; + } + } + + return split_kernel->load_kernels(requested_features); +} + +void OpenCLDevice::mem_alloc(device_memory& mem) +{ + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + size_t size = mem.memory_size(); + + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(mem.name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", mem.name); + } + set_error(error); + + return; + } + + cl_mem_flags mem_flag; + void *mem_ptr = NULL; + + if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + /* Zero-size allocation might be invoked by render, but not really + * supported by OpenCL. Using NULL as device pointer also doesn't really + * work for some reason, so for the time being we'll use special case + * will null_mem buffer. + */ + if(size != 0) { + mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, + mem_flag, + size, + mem_ptr, + &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer"); + } + else { + mem.device_pointer = null_mem; + } + + stats.mem_alloc(size); + mem.device_size = size; +} + +void OpenCLDevice::mem_copy_to(device_memory& mem) +{ + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if(!mem.device_pointer) { + mem_alloc(mem); + } + + /* this is blocking */ + size_t size = mem.memory_size(); + if(size != 0) { + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + size, + mem.host_pointer, + 0, + NULL, NULL)); + } + } +} + +void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) +{ + size_t offset = elem*y*w; + size_t size = elem*w*h; + assert(size != 0); + opencl_assert(clEnqueueReadBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + offset, + size, + (uchar*)mem.host_pointer + offset, + 0, + NULL, NULL)); +} + +void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) +{ + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - background = background_; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while(d_offset < size) { + d_size = std::min(num_threads*sizeof(float4), size - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } +} + +void OpenCLDevice::mem_zero(device_memory& mem) +{ + if(!mem.device_pointer) { + mem_alloc(mem); + } + + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); + } + + if(mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if(!base_program.is_loaded()) { + void* zero = mem.host_pointer; + + if(!mem.host_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.host_pointer) { + util_aligned_free(zero); + } + } + } +} + +void OpenCLDevice::mem_free(device_memory& mem) +{ + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + if(mem.device_pointer) { + if(mem.device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + } + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } +} + +int OpenCLDevice::mem_sub_ptr_alignment() +{ + return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); +} + +device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) +{ + cl_mem_flags mem_flag; + if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + +void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size) +{ + ConstMemMap::iterator i = const_mem_map.find(name); + device_vector *data; + + if(i == const_mem_map.end()) { + data = new device_vector(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + memcpy(data->data(), host, size); + data->copy_to_device(); +} + +void OpenCLDevice::tex_alloc(device_memory& mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + memory_manager.alloc(mem.name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[mem.name] = &mem; + textures_need_update = true; +} + +void OpenCLDevice::tex_free(device_memory& mem) +{ + if(mem.device_pointer) { + mem.device_pointer = 0; + + if(memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach(TexturesMap::value_type& value, textures) { + if(value.second == &mem) { + textures.erase(value.first); + break; + } + } + } +} + +size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) +{ + int r = global_size % group_size; + return global_size + ((r == 0)? 0: group_size - r); +} + +void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) +{ + size_t workgroup_size, max_work_items[3]; + + clGetKernelWorkGroupInfo(kernel, cdDevice, + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); + clGetDeviceInfo(cdDevice, + CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + + /* Try to divide evenly over 2 dimensions. */ + size_t local_size[2]; + if(x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } + + /* Some implementations have max size 1 on 2nd dimension. */ + if(local_size[1] > max_work_items[1]) { + local_size[0] = workgroup_size/max_work_items[1]; + local_size[1] = max_work_items[1]; + } + + size_t global_size[2] = {global_size_round_up(local_size[0], w), + global_size_round_up(local_size[1], h)}; + + /* Vertical size of 1 is coming from bake/shade kernels where we should + * not round anything up because otherwise we'll either be doing too + * much work per pixel (if we don't check global ID on Y axis) or will + * be checking for global ID to always have Y of 0. + */ + if(h == 1) { + global_size[h] = 1; + } + + /* run kernel */ + opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + opencl_assert(clFlush(cqCommandQueue)); +} + +void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) +{ + cl_mem ptr; + + MemMap::iterator i = mem_map.find(name); + if(i != mem_map.end()) { + ptr = CL_MEM_PTR(i->second); + } + else { + /* work around NULL not working, even though the spec says otherwise */ + ptr = CL_MEM_PTR(null_mem); + } + + opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); +} + +void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + flush_texture_buffers(); + + memory_manager.set_kernel_arg_buffers(kernel, narg); +} + +void OpenCLDevice::flush_texture_buffers() +{ + if(!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. */ + int num_slots = 0; + + vector texture_slots; + +#define KERNEL_TEX(type, name) \ + if(textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +#include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach(TexturesMap::value_type& tex, textures) { + string name = tex.first; + + if(string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, + num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_info); + texture_info.resize(num_slots); + memory_manager.alloc("texture_info", texture_info); + + /* Fill in descriptors */ + foreach(texture_slot_t& slot, texture_slots) { + TextureInfo& info = texture_info[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; + + if(string_startswith(slot.name, "__tex_image")) { + device_memory *mem = textures[slot.name]; + + info.width = mem->data_width; + info.height = mem->data_height; + info.depth = mem->data_depth; + + info.interpolation = mem->interpolation; + info.extension = mem->extension; + } + } + + /* Force write of descriptors. */ + memory_manager.free(texture_info); + memory_manager.alloc("texture_info", texture_info); +} + + +void OpenCLDevice::thread_run(DeviceTask *task) +{ + flush_texture_buffers(); + + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if(task->type == DeviceTask::SHADER) { + shader(*task); + } + else if(task->type == DeviceTask::RENDER) { + RenderTile tile; + DenoisingTask denoising(this, *task); + + /* Allocate buffer for kernel globals */ + device_only_memory kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + scoped_timer timer(&tile.buffers->render_time); + + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising); + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + + kgbuffer.free(); + } +} + +void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) +{ + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); + cl_mem d_buffer = CL_MEM_PTR(buffer); + cl_int d_x = task.x; + cl_int d_y = task.y; + cl_int d_w = task.w; + cl_int d_h = task.h; + cl_float d_sample_scale = 1.0f/(task.sample + 1); + cl_int d_offset = task.offset; + cl_int d_stride = task.stride; + + + cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); + + cl_uint start_arg_index = + kernel_set_args(ckFilmConvertKernel, + 0, + d_data, + d_rgba, + d_buffer); + + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); + + start_arg_index += kernel_set_args(ckFilmConvertKernel, + start_arg_index, + d_sample_scale, + d_x, + d_y, + d_w, + d_h, + d_offset, + d_stride); + + enqueue_kernel(ckFilmConvertKernel, d_w, d_h); +} + +bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r+1)*(2*r+1); + int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); + cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; + + mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); + mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + kernel_set_args(ckNLMCalcDifference, 0, + guide_mem, + variance_mem, + scale_mem, + difference_mem, + w, h, stride, + pass_stride, + r, channel_offset, + 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference_mem, + blurDifference_mem, + w, h, stride, + pass_stride, + r, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference_mem, + difference_mem, + w, h, stride, + pass_stride, + r, f); + kernel_set_args(ckNLMUpdateOutput, 0, + blurDifference_mem, + image_mem, + out_mem, + weightAccum_mem, + w, h, stride, + pass_stride, + channel_offset, + r, f); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); + + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum_mem, w, h, stride); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time? 1 : 0; + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, + arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) +{ + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time? 1 : 0; + + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r+1)*(2*r+1); + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + kernel_set_args(ckNLMCalcDifference, 0, + color_mem, + color_variance_mem, + scale_mem, + difference_mem, + w, h, stride, + pass_stride, + r, + pass_stride, + frame_offset, + 1.0f, task->nlm_k_2); + kernel_set_args(ckNLMBlur, 0, + difference_mem, + blurDifference_mem, + w, h, stride, + pass_stride, + r, 4); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference_mem, + difference_mem, + w, h, stride, + pass_stride, + r, 4); + kernel_set_args(ckNLMConstructGramian, 0, + t, + blurDifference_mem, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, h, stride, + pass_stride, + r, 4, + frame_offset, + use_time); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + + return true; +} + +bool OpenCLDevice::denoising_solve(device_ptr output_ptr, + DenoisingTask *task) +{ + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + + kernel_set_args(ckFinalize, 0, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, w, h); + + return true; +} + +bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterDivideShadow, arg_ofs, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDevice::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterGetFeature, arg_ofs, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + scale, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDevice::denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) +{ + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, + task->filter_area.z, + task->filter_area.w); + + return true; +} + +bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising) +{ + denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); +} + +void OpenCLDevice::shader(DeviceTask& task) +{ + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_input = CL_MEM_PTR(task.shader_input); + cl_mem d_output = CL_MEM_PTR(task.shader_output); + cl_int d_shader_eval_type = task.shader_eval_type; + cl_int d_shader_filter = task.shader_filter; + cl_int d_shader_x = task.shader_x; + cl_int d_shader_w = task.shader_w; + cl_int d_offset = task.offset; + + cl_kernel kernel; + + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { + kernel = bake_program(ustring("bake")); + } + else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { + kernel = displace_program(ustring("displace")); + } + else { + kernel = background_program(ustring("background")); + } + + cl_uint start_arg_index = + kernel_set_args(kernel, + 0, + d_data, + d_input, + d_output); + + set_kernel_arg_buffers(kernel, &start_arg_index); + + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_eval_type); + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_filter); + } + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_x, + d_shader_w, + d_offset); + + for(int sample = 0; sample < task.num_samples; sample++) { + + if(task.get_cancel()) + break; + + kernel_set_args(kernel, start_arg_index, sample); + + enqueue_kernel(kernel, task.shader_w, 1); + + clFinish(cqCommandQueue); + + task.update_progress(NULL); + } +} + +string OpenCLDevice::kernel_build_options(const string *debug_src) +{ + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if(platform_name == "NVIDIA CUDA") { + build_options += "-D__KERNEL_OPENCL_NVIDIA__ " + "-cl-nv-maxrregcount=32 " + "-cl-nv-verbose "; + + uint compute_capability_major, compute_capability_minor; + clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &compute_capability_major, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &compute_capability_minor, NULL); + + build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", + compute_capability_major * 100 + + compute_capability_minor * 10); + } + + else if(platform_name == "Apple") + build_options += "-D__KERNEL_OPENCL_APPLE__ "; + + else if(platform_name == "AMD Accelerated Parallel Processing") + build_options += "-D__KERNEL_OPENCL_AMD__ "; + + else if(platform_name == "Intel(R) OpenCL") { + build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; + + /* Options for gdb source level kernel debugging. + * this segfaults on linux currently. + */ + if(OpenCLInfo::use_debug() && debug_src) + build_options += "-g -s \"" + *debug_src + "\" "; + } + + if(info.has_half_images) { + build_options += "-D__KERNEL_CL_KHR_FP16__ "; + } + + if(OpenCLInfo::use_debug()) { + build_options += "-D__KERNEL_OPENCL_DEBUG__ "; + } + +#ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +#endif + + return build_options; +} + +/* TODO(sergey): In the future we can use variadic templates, once + * C++0x is allowed. Should allow to clean this up a bit. + */ +int OpenCLDevice::kernel_set_args(cl_kernel kernel, + int start_argument_index, + const ArgumentWrapper& arg1, + const ArgumentWrapper& arg2, + const ArgumentWrapper& arg3, + const ArgumentWrapper& arg4, + const ArgumentWrapper& arg5, + const ArgumentWrapper& arg6, + const ArgumentWrapper& arg7, + const ArgumentWrapper& arg8, + const ArgumentWrapper& arg9, + const ArgumentWrapper& arg10, + const ArgumentWrapper& arg11, + const ArgumentWrapper& arg12, + const ArgumentWrapper& arg13, + const ArgumentWrapper& arg14, + const ArgumentWrapper& arg15, + const ArgumentWrapper& arg16, + const ArgumentWrapper& arg17, + const ArgumentWrapper& arg18, + const ArgumentWrapper& arg19, + const ArgumentWrapper& arg20, + const ArgumentWrapper& arg21, + const ArgumentWrapper& arg22, + const ArgumentWrapper& arg23, + const ArgumentWrapper& arg24, + const ArgumentWrapper& arg25, + const ArgumentWrapper& arg26, + const ArgumentWrapper& arg27, + const ArgumentWrapper& arg28, + const ArgumentWrapper& arg29, + const ArgumentWrapper& arg30, + const ArgumentWrapper& arg31, + const ArgumentWrapper& arg32, + const ArgumentWrapper& arg33) +{ + int current_arg_index = 0; +#define FAKE_VARARG_HANDLE_ARG(arg) \ + do { \ + if(arg.pointer != NULL) { \ + opencl_assert(clSetKernelArg( \ + kernel, \ + start_argument_index + current_arg_index, \ + arg.size, arg.pointer)); \ + ++current_arg_index; \ + } \ + else { \ + return current_arg_index; \ + } \ + } while(false) + FAKE_VARARG_HANDLE_ARG(arg1); + FAKE_VARARG_HANDLE_ARG(arg2); + FAKE_VARARG_HANDLE_ARG(arg3); + FAKE_VARARG_HANDLE_ARG(arg4); + FAKE_VARARG_HANDLE_ARG(arg5); + FAKE_VARARG_HANDLE_ARG(arg6); + FAKE_VARARG_HANDLE_ARG(arg7); + FAKE_VARARG_HANDLE_ARG(arg8); + FAKE_VARARG_HANDLE_ARG(arg9); + FAKE_VARARG_HANDLE_ARG(arg10); + FAKE_VARARG_HANDLE_ARG(arg11); + FAKE_VARARG_HANDLE_ARG(arg12); + FAKE_VARARG_HANDLE_ARG(arg13); + FAKE_VARARG_HANDLE_ARG(arg14); + FAKE_VARARG_HANDLE_ARG(arg15); + FAKE_VARARG_HANDLE_ARG(arg16); + FAKE_VARARG_HANDLE_ARG(arg17); + FAKE_VARARG_HANDLE_ARG(arg18); + FAKE_VARARG_HANDLE_ARG(arg19); + FAKE_VARARG_HANDLE_ARG(arg20); + FAKE_VARARG_HANDLE_ARG(arg21); + FAKE_VARARG_HANDLE_ARG(arg22); + FAKE_VARARG_HANDLE_ARG(arg23); + FAKE_VARARG_HANDLE_ARG(arg24); + FAKE_VARARG_HANDLE_ARG(arg25); + FAKE_VARARG_HANDLE_ARG(arg26); + FAKE_VARARG_HANDLE_ARG(arg27); + FAKE_VARARG_HANDLE_ARG(arg28); + FAKE_VARARG_HANDLE_ARG(arg29); + FAKE_VARARG_HANDLE_ARG(arg30); + FAKE_VARARG_HANDLE_ARG(arg31); + FAKE_VARARG_HANDLE_ARG(arg32); + FAKE_VARARG_HANDLE_ARG(arg33); +#undef FAKE_VARARG_HANDLE_ARG + return current_arg_index; +} + +void OpenCLDevice::release_kernel_safe(cl_kernel kernel) +{ + if(kernel) { + clReleaseKernel(kernel); + } +} + +void OpenCLDevice::release_mem_object_safe(cl_mem mem) +{ + if(mem != NULL) { + clReleaseMemObject(mem); + } +} + +void OpenCLDevice::release_program_safe(cl_program program) +{ + if(program) { + clReleaseProgram(program); + } +} + +/* ** Those guys are for workign around some compiler-specific bugs ** */ + +cl_program OpenCLDevice::load_cached_kernel( + ustring key, + thread_scoped_lock& cache_locker) +{ + return OpenCLCache::get_program(cpPlatform, + cdDevice, + key, + cache_locker); +} + +void OpenCLDevice::store_cached_kernel( + cl_program program, + ustring key, + thread_scoped_lock& cache_locker) +{ + OpenCLCache::store_program(cpPlatform, + cdDevice, + program, + key, + cache_locker); } Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) { - return new OpenCLDeviceSplitKernel(info, stats, profiler, background); + return new OpenCLDevice(info, stats, profiler, background); } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index a6a80b0c2de..8ef622fad01 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -243,7 +243,7 @@ string OpenCLCache::get_kernel_md5() return self.kernel_md5; } -OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, +OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, const string& program_name, const string& kernel_file, const string& kernel_build_options, @@ -258,12 +258,12 @@ OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, program = NULL; } -OpenCLDeviceBase::OpenCLProgram::~OpenCLProgram() +OpenCLDevice::OpenCLProgram::~OpenCLProgram() { release(); } -void OpenCLDeviceBase::OpenCLProgram::release() +void OpenCLDevice::OpenCLProgram::release() { for(map::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { if(kernel->second) { @@ -277,7 +277,7 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) +void OpenCLDevice::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; @@ -291,7 +291,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) } } -void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) +void OpenCLDevice::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -302,14 +302,14 @@ void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) error_msg += msg; } -void OpenCLDeviceBase::OpenCLProgram::add_kernel(ustring name) +void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) { if(!kernels.count(name)) { kernels[name] = NULL; } } -bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) +bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) { string build_options; build_options = device->kernel_build_options(debug_src) + kernel_build_options; @@ -341,7 +341,7 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) return (ciErr == CL_SUCCESS); } -bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) +bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) { string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; /* We compile kernels consisting of many files. unfortunately OpenCL @@ -389,14 +389,13 @@ static void escape_python_string(string& str) string_replace(str, "'", "\'"); } -bool OpenCLDeviceBase::OpenCLProgram::compile_separate(const string& clbin) +bool OpenCLDevice::OpenCLProgram::compile_separate(const string& clbin) { vector args; args.push_back("--background"); args.push_back("--factory-startup"); args.push_back("--python-expr"); - const char *force_all_platforms = (DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT)? "true" : "false"; int device_platform_id = device->device_num; string device_name = device->device_name; string platform_name = device->platform_name; @@ -412,8 +411,7 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_separate(const string& clbin) args.push_back( string_printf( - "import _cycles; _cycles.opencl_compile(r'%s', r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - force_all_platforms, + "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", device_platform_id, device_name.c_str(), platform_name.c_str(), @@ -438,20 +436,19 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_separate(const string& clbin) * module compile kernels. Parameters must match function above. */ bool device_opencl_compile_kernel(const vector& parameters) { - bool force_all_platforms = parameters[0] == "true"; - int device_platform_id = std::stoi(parameters[1]); - const string& device_name = parameters[2]; - const string& platform_name = parameters[3]; - const string& build_options = parameters[4]; - const string& kernel_file = parameters[5]; - const string& binary_path = parameters[6]; + int device_platform_id = std::stoi(parameters[0]); + const string& device_name = parameters[1]; + const string& platform_name = parameters[2]; + const string& build_options = parameters[3]; + const string& kernel_file = parameters[4]; + const string& binary_path = parameters[5]; if(clewInit() != CLEW_SUCCESS) { return false; } vector usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices, force_all_platforms); + OpenCLInfo::get_usable_devices(&usable_devices); if(device_platform_id >= usable_devices.size()) { return false; } @@ -504,7 +501,7 @@ bool device_opencl_compile_kernel(const vector& parameters) return result; } -bool OpenCLDeviceBase::OpenCLProgram::load_binary(const string& clbin, +bool OpenCLDevice::OpenCLProgram::load_binary(const string& clbin, const string *debug_src) { /* read binary into memory */ @@ -535,7 +532,7 @@ bool OpenCLDeviceBase::OpenCLProgram::load_binary(const string& clbin, return true; } -bool OpenCLDeviceBase::OpenCLProgram::save_binary(const string& clbin) +bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin) { size_t size = 0; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); @@ -551,7 +548,7 @@ bool OpenCLDeviceBase::OpenCLProgram::save_binary(const string& clbin) return path_write_binary(clbin, binary); } -void OpenCLDeviceBase::OpenCLProgram::load() +void OpenCLDevice::OpenCLProgram::load() { assert(device); @@ -642,7 +639,7 @@ void OpenCLDeviceBase::OpenCLProgram::load() loaded = true; } -void OpenCLDeviceBase::OpenCLProgram::report_error() +void OpenCLDevice::OpenCLProgram::report_error() { /* If loaded is true, there was no error. */ if(loaded) return; @@ -656,13 +653,13 @@ void OpenCLDeviceBase::OpenCLProgram::report_error() } } -cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()() +cl_kernel OpenCLDevice::OpenCLProgram::operator()() { assert(kernels.size() == 1); return kernels.begin()->second; } -cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()(ustring name) +cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) { assert(kernels.count(name)); return kernels[name]; @@ -716,28 +713,6 @@ bool OpenCLInfo::kernel_use_advanced_shading(const string& platform) return false; } -bool OpenCLInfo::kernel_use_split(const string& platform_name, - const cl_device_type device_type) -{ - if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) { - VLOG(1) << "Forcing split kernel to use."; - return true; - } - if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) { - VLOG(1) << "Forcing mega kernel to use."; - return false; - } - /* TODO(sergey): Replace string lookups with more enum-like API, - * similar to device/vendor checks blender's gpu. - */ - if(platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) - { - return true; - } - return false; -} - bool OpenCLInfo::device_supported(const string& platform_name, const cl_device_id device_id) { @@ -878,8 +853,6 @@ string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id dev void OpenCLInfo::get_usable_devices(vector *usable_devices, bool force_all) { - const bool force_all_platforms = force_all || - (DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT); const cl_device_type device_type = OpenCLInfo::device_type(); static bool first_time = true; #define FIRST_VLOG(severity) if(first_time) VLOG(severity) @@ -952,7 +925,7 @@ void OpenCLInfo::get_usable_devices(vector *usable_devices << " due to old compiler version."; continue; } - if(force_all_platforms || + if(force_all || device_supported(platform_name, device_id)) { cl_device_type device_type; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 0a2acd3f669..7332346a787 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -36,8 +36,8 @@ set(SRC_CUDA_KERNELS ) set(SRC_OPENCL_KERNELS - kernels/opencl/kernel.cl kernels/opencl/kernel_bake.cl + kernels/opencl/kernel_base.cl kernels/opencl/kernel_displace.cl kernels/opencl/kernel_background.cl kernels/opencl/kernel_state_buffer_size.cl diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl deleted file mode 100644 index aa837e2ae87..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* OpenCL kernel entry points - unfinished */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_film.h" - -#if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -#else /* __COMPILE_ONLY_MEGAKERNEL__ */ -/* Include only actually used headers for the case - * when path tracing kernels are not needed. - */ -# include "kernel/kernel_random.h" -# include "kernel/kernel_differential.h" -# include "kernel/kernel_montecarlo.h" -# include "kernel/kernel_projection.h" -# include "kernel/geom/geom.h" -# include "kernel/bvh/bvh.h" - -# include "kernel/kernel_accumulate.h" -# include "kernel/kernel_camera.h" -# include "kernel/kernel_shader.h" -#endif /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */ - -#include "kernel/kernel_bake.h" - -#ifdef __COMPILE_ONLY_MEGAKERNEL__ - -__kernel void kernel_ocl_path_trace( - ccl_constant KernelData *data, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - int sample, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - bool thread_is_active = x < sx + sw && y < sy + sh; - if(thread_is_active) { - kernel_path_trace(kg, buffer, sample, x, y, offset, stride); - } - if(kernel_data.film.cryptomatte_passes) { - /* Make sure no thread is writing to the buffers. */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if(thread_is_active) { - kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride); - } - } -} - -#else /* __COMPILE_ONLY_MEGAKERNEL__ */ - -__kernel void kernel_ocl_convert_to_byte( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_convert_to_half_float( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) -{ - size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); - - if(i < size / sizeof(float4)) { - buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - else if(i == size / sizeof(float4)) { - ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; - - for(i = 0; i < size % sizeof(float4); i++) { - *(b++) = 0; - } - } -} - -#endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl new file mode 100644 index 00000000000..1c2d89e8a92 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_base.cl @@ -0,0 +1,88 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL base kernels entry points */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" + +#include "kernel/kernel_film.h" + + +__kernel void kernel_ocl_convert_to_byte( + ccl_constant KernelData *data, + ccl_global uchar4 *rgba, + ccl_global float *buffer, + + KERNEL_BUFFER_PARAMS, + + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); + + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +__kernel void kernel_ocl_convert_to_half_float( + ccl_constant KernelData *data, + ccl_global uchar4 *rgba, + ccl_global float *buffer, + + KERNEL_BUFFER_PARAMS, + + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); + + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index cd316c46b89..e8929c32394 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -90,7 +90,6 @@ void DebugFlags::CUDA::reset() DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), - kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT), debug(false), single_program(false) { @@ -122,14 +121,6 @@ void DebugFlags::OpenCL::reset() device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR; } } - /* Initialize kernel type from environment variables. */ - kernel_type = DebugFlags::OpenCL::KERNEL_DEFAULT; - if(getenv("CYCLES_OPENCL_MEGA_KERNEL_TEST") != NULL) { - kernel_type = DebugFlags::OpenCL::KERNEL_MEGA; - } - else if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) { - kernel_type = DebugFlags::OpenCL::KERNEL_SPLIT; - } /* Initialize other flags from environment variables. */ debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL); @@ -164,8 +155,7 @@ std::ostream& operator <<(std::ostream &os, os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; - const char *opencl_device_type, - *opencl_kernel_type; + const char *opencl_device_type; switch(debug_flags.opencl.device_type) { case DebugFlags::OpenCL::DEVICE_NONE: opencl_device_type = "NONE"; @@ -186,20 +176,8 @@ std::ostream& operator <<(std::ostream &os, opencl_device_type = "ACCELERATOR"; break; } - switch(debug_flags.opencl.kernel_type) { - case DebugFlags::OpenCL::KERNEL_DEFAULT: - opencl_kernel_type = "DEFAULT"; - break; - case DebugFlags::OpenCL::KERNEL_MEGA: - opencl_kernel_type = "MEGA"; - break; - case DebugFlags::OpenCL::KERNEL_SPLIT: - opencl_kernel_type = "SPLIT"; - break; - } os << "OpenCL flags:\n" << " Device type : " << opencl_device_type << "\n" - << " Kernel type : " << opencl_kernel_type << "\n" << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n" << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 864089bb118..9b5eb2f9fbd 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -123,9 +123,6 @@ public: /* Requested device type. */ DeviceType device_type; - /* Requested kernel type. */ - KernelType kernel_type; - /* Use debug version of the kernel. */ bool debug; -- cgit v1.2.3