From a02992f1313811c9905e44dc95a0aee31d707f67 Mon Sep 17 00:00:00 2001 From: Xavier Hallade Date: Wed, 29 Jun 2022 12:58:04 +0200 Subject: Cycles: Add support for rendering on Intel GPUs using oneAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a new Cycles device with similar functionality to the existing GPU devices. Kernel compilation and runtime interaction happen via oneAPI DPC++ compiler and SYCL API. This implementation is primarily focusing on Intel® Arc™ GPUs and other future Intel GPUs. The first supported drivers are 101.1660 on Windows and 22.10.22597 on Linux. The necessary tools for compilation are: - A SYCL compiler such as oneAPI DPC++ compiler or https://github.com/intel/llvm - Intel® oneAPI Level Zero which is used for low level device queries: https://github.com/oneapi-src/level-zero - To optionally generate prebuilt graphics binaries: Intel® Graphics Compiler All are included in Linux precompiled libraries on svn: https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for Windows precompiled binaries but for the graphics compiler, available as "Intel® Graphics Offline Compiler for OpenCL™ Code" from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html, for which path can be set as OCLOC_INSTALL_DIR. Being based on the open SYCL standard, this implementation could also be extended to run on other compatible non-Intel hardware in the future. 
Reviewed By: sergey, brecht

Differential Revision: https://developer.blender.org/D15254

Co-authored-by: Nikita Sirgienko
Co-authored-by: Stefan Werner
---
 intern/cycles/device/oneapi/queue.cpp | 191 ++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 intern/cycles/device/oneapi/queue.cpp

(limited to 'intern/cycles/device/oneapi/queue.cpp')

diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
new file mode 100644
index 00000000000..42e2408ee7a
--- /dev/null
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+#  include "device/oneapi/queue.h"
+#  include "device/oneapi/device_impl.h"
+#  include "util/log.h"
+#  include "util/time.h"
+/* NOTE(review): the names of the next two headers were lost when this patch was extracted;
+ * <iomanip> and <vector> are a reconstruction, confirm them against the original commit. */
+#  include <iomanip>
+#  include <vector>
+
+#  include "kernel/device/oneapi/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Compute unit count from which num_concurrent_states() and num_concurrent_busy_states()
+ * take their dGPU path. */
+static const size_t ONEAPI_DGPU_MIN_COMPUTE_UNITS = 128;
+
+/* Elapsed-time total and enqueue counter; nothing else in this file references it. */
+struct KernelExecutionInfo {
+  double elapsed_summary = 0.0;
+  int enqueue_count = 0;
+};
+
+/* OneapiDeviceQueue */
+
+OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
+    : DeviceQueue(device),
+      oneapi_device_(device),
+      oneapi_dll_(device->oneapi_dll_object()),
+      kernel_context_(nullptr)
+{
+}
+
+OneapiDeviceQueue::~OneapiDeviceQueue()
+{
+  /* Allocated by init_execution(); deleting nullptr is a no-op if it never ran. */
+  delete kernel_context_;
+}
+
+/* Pick the number of integrator states to allocate, given `state_size` bytes per state.
+ * Devices reporting at least ONEAPI_DGPU_MIN_COMPUTE_UNITS compute units get a count that
+ * grows with the compute unit count, capped so that the states use at most a quarter of the
+ * reported memory capacity. All other devices get a fixed count. */
+int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+  int num_states;
+
+  /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
+  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
+      oneapi_device_->sycl_queue());
+  if (compute_units >= ONEAPI_DGPU_MIN_COMPUTE_UNITS) {
+    /* dGPU path, makes sense to allocate more states, because it will be dedicated GPU memory. */
+    const size_t base = 1024 * 1024;
+    /* Linear dependency (with coefficient less than 1) on the amount of compute units. The
+     * integer division makes it grow in steps of ONEAPI_DGPU_MIN_COMPUTE_UNITS. */
+    size_t scaled_states = (base * (compute_units / ONEAPI_DGPU_MIN_COMPUTE_UNITS)) * 3 / 4;
+
+    /* Limit amount of integrator states by one quarter of device memory, because
+     * other allocations will need some space as well.
+     * TODO: base this calculation on how many states the GPU is actually capable of
+     * running, with some headroom to improve occupancy. If the textures don't fit, offload
+     * into unified memory. */
+    const size_t device_memory_amount =
+        (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
+    const size_t states_memory_budget = device_memory_amount / 4;
+    /* The state_size test keeps the division below from ever dividing by zero. */
+    if (state_size > 0 && scaled_states * state_size >= states_memory_budget) {
+      scaled_states = states_memory_budget / state_size;
+    }
+    num_states = static_cast<int>(scaled_states);
+  }
+  else {
+    /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
+     * memory. */
+    num_states = 1024 * 512;
+  }
+
+  VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
+                    << string_human_readable_size(num_states * state_size);
+
+  return num_states;
+}
+
+/* Busy state count: one of two fixed values, chosen by the same compute unit threshold as
+ * in num_concurrent_states(). */
+int OneapiDeviceQueue::num_concurrent_busy_states() const
+{
+  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
+      oneapi_device_->sycl_queue());
+  return (compute_units >= ONEAPI_DGPU_MIN_COMPUTE_UNITS) ? 1024 * 1024 : 1024 * 512;
+}
+
+/* Load texture info on the device, then build the kernel context from the device's SYCL
+ * queue and kernel globals pointer. */
+void OneapiDeviceQueue::init_execution()
+{
+  oneapi_device_->load_texture_info();
+
+  SyclQueue *device_queue = oneapi_device_->sycl_queue();
+  void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
+  assert(device_queue);
+  assert(kg_dptr);
+
+  /* Free the context of an earlier call first, otherwise every repeated call leaks one. */
+  delete kernel_context_;
+  kernel_context_ = new KernelContext{device_queue, kg_dptr};
+
+  debug_init_execution();
+}
+
+/* Launch `kernel` with the work size rounded up to a multiple of the preferred local size.
+ * Returns false without launching when the device already has an error. When the launch
+ * itself fails, an error is set on the device and false is returned. */
+bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
+                                const int signed_kernel_work_size,
+                                DeviceKernelArguments const &_args)
+{
+  if (oneapi_device_->have_error()) {
+    return false;
+  }
+
+  /* Checked before the first dereference below; the context is created by init_execution(). */
+  assert(kernel_context_);
+  assert(signed_kernel_work_size >= 0);
+
+  void **args = const_cast<void **>(_args.values);
+
+  debug_enqueue(kernel, signed_kernel_work_size);
+  const size_t kernel_work_size = static_cast<size_t>(signed_kernel_work_size);
+
+  const size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
+      kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
+  const size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
+
+  /* Call the oneAPI kernel DLL to launch the requested kernel. */
+  const bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
+      kernel_context_, kernel, uniformed_kernel_work_size, args);
+
+  if (!is_finished_ok) {
+    oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +
+                              "\" execution error: got runtime exception \"" +
+                              oneapi_device_->oneapi_error_message() + "\"");
+  }
+
+  return is_finished_ok;
+}
+
+/* Synchronize the device's SYCL queue. Returns false when the device has an error, whether
+ * it was set before the call or by the synchronization itself. */
+bool OneapiDeviceQueue::synchronize()
+{
+  if (oneapi_device_->have_error()) {
+    return false;
+  }
+
+  const bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
+  if (!is_finished_ok) {
+    oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
+                              oneapi_device_->oneapi_error_message() + "\"");
+  }
+
+  debug_synchronize();
+
+  return !(oneapi_device_->have_error());
+}
+
+/* The memory operations below are forwarded to the device unchanged. */
+
+void OneapiDeviceQueue::zero_to_device(device_memory &mem)
+{
+  oneapi_device_->mem_zero(mem);
+}
+
+void OneapiDeviceQueue::copy_to_device(device_memory &mem)
+{
+  oneapi_device_->mem_copy_to(mem);
+}
+
+void OneapiDeviceQueue::copy_from_device(device_memory &mem)
+{
+  oneapi_device_->mem_copy_from(mem);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_ONEAPI */
-- 
cgit v1.2.3