Cycles: Add support for rendering on Intel GPUs using oneAPI

This patch adds a new Cycles device with similar functionality to the existing GPU devices. Kernel compilation and runtime interaction happen via oneAPI DPC++ compiler and SYCL API. This implementation is primarly focusing on Intel® Arc™ GPUs and other future Intel GPUs. The first supported drivers are 101.1660 on Windows and 22.10.22597 on Linux. The necessary tools for compilation are: - A SYCL compiler such as oneAPI DPC++ compiler or https://github.com/intel/llvm - Intel® oneAPI Level Zero which is used for low level device queries: https://github.com/oneapi-src/level-zero - To optionally generate prebuilt graphics binaries: Intel® Graphics Compiler All are included in Linux precompiled libraries on svn: https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for Windows precompiled binaries but for the graphics compiler, available as "Intel® Graphics Offline Compiler for OpenCL™ Code" from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html, for which path can be set as OCLOC_INSTALL_DIR. Being based on the open SYCL standard, this implementation could also be extended to run on other compatible non-Intel hardware in the future. Reviewed By: sergey, brecht Differential Revision: https://developer.blender.org/D15254 Co-authored-by: Nikita Sirgienko <nikita.sirgienko@intel.com> Co-authored-by: Stefan Werner <stefan.werner@intel.com>
author: Xavier Hallade <xavier.hallade@intel.com> 2022-06-29 13:58:04 +0300
committer: Xavier Hallade <xavier.hallade@intel.com> 2022-06-29 13:58:04 +0300
commit: a02992f1313811c9905e44dc95a0aee31d707f67 (patch)
tree: 2d1f59524e2a298bb530ad578a2d2b9e2c4a1432 /intern/cycles/device
parent: 302b04a5a3fc0e767ac784424f78ce2edf5d2844 (diff)
10 files changed, 1026 insertions, 0 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6205775260a..6418801c572 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -82,6 +82,15 @@ set(SRC_HIP
   hip/util.h
 )
 
+set(SRC_ONEAPI
+  oneapi/device_impl.cpp
+  oneapi/device_impl.h
+  oneapi/device.cpp
+  oneapi/device.h
+  oneapi/queue.cpp
+  oneapi/queue.h
+)
+
 set(SRC_DUMMY
   dummy/device.cpp
   dummy/device.h
@@ -134,6 +143,7 @@ set(SRC
   ${SRC_DUMMY}
   ${SRC_MULTI}
   ${SRC_OPTIX}
+  ${SRC_ONEAPI}
   ${SRC_HEADERS}
 )
 
@@ -181,6 +191,9 @@ if(WITH_CYCLES_DEVICE_METAL)
     ${SRC_METAL}
   )
 endif()
+if (WITH_CYCLES_DEVICE_ONEAPI)
+  add_definitions(-DWITH_ONEAPI)
+endif()
 
 if(WITH_OPENIMAGEDENOISE)
   list(APPEND LIB
@@ -193,6 +206,11 @@ include_directories(SYSTEM ${INC_SYS})
 
 cycles_add_library(cycles_device "${LIB}" ${SRC})
 
+if(WITH_CYCLES_DEVICE_ONEAPI)
+  # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour
+  add_dependencies(cycles_device cycles_kernel_oneapi)
+endif()
+
 source_group("cpu" FILES ${SRC_CPU})
 source_group("cuda" FILES ${SRC_CUDA})
 source_group("dummy" FILES ${SRC_DUMMY})
@@ -200,4 +218,5 @@ source_group("hip" FILES ${SRC_HIP})
 source_group("multi" FILES ${SRC_MULTI})
 source_group("metal" FILES ${SRC_METAL})
 source_group("optix" FILES ${SRC_OPTIX})
+source_group("oneapi" FILES ${SRC_ONEAPI})
 source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 82c7881da5f..ace6ed517f5 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -16,6 +16,7 @@
 #include "device/hip/device.h"
 #include "device/metal/device.h"
 #include "device/multi/device.h"
+#include "device/oneapi/device.h"
 #include "device/optix/device.h"
 
 #include "util/foreach.h"
@@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices;
 vector<DeviceInfo> Device::cpu_devices;
 vector<DeviceInfo> Device::hip_devices;
 vector<DeviceInfo> Device::metal_devices;
+vector<DeviceInfo> Device::oneapi_devices;
 uint Device::devices_initialized_mask = 0;
 
 /* Device */
@@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
         device = device_metal_create(info, stats, profiler);
       break;
 #endif
+
+#ifdef WITH_ONEAPI
+    case DEVICE_ONEAPI:
+      device = device_oneapi_create(info, stats, profiler);
+      break;
+#endif
+
     default:
       break;
   }
@@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name)
     return DEVICE_HIP;
   else if (strcmp(name, "METAL") == 0)
     return DEVICE_METAL;
+  else if (strcmp(name, "ONEAPI") == 0)
+    return DEVICE_ONEAPI;
 
   return DEVICE_NONE;
 }
@@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type)
     return "HIP";
   else if (type == DEVICE_METAL)
     return "METAL";
+  else if (type == DEVICE_ONEAPI)
+    return "ONEAPI";
 
   return "";
 }
@@ -164,6 +177,9 @@ vector<DeviceType> Device::available_types()
 #ifdef WITH_METAL
   types.push_back(DEVICE_METAL);
 #endif
+#ifdef WITH_ONEAPI
+  types.push_back(DEVICE_ONEAPI);
+#endif
   return types;
 }
 
@@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
   }
 #endif
 
+#ifdef WITH_ONEAPI
+  if (mask & DEVICE_MASK_ONEAPI) {
+    if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) {
+      if (device_oneapi_init()) {
+        device_oneapi_info(oneapi_devices);
+      }
+      devices_initialized_mask |= DEVICE_MASK_ONEAPI;
+    }
+    foreach (DeviceInfo &info, oneapi_devices) {
+      devices.push_back(info);
+    }
+  }
+#endif
+
   if (mask & DEVICE_MASK_CPU) {
     if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
       device_cpu_info(cpu_devices);
@@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask)
   }
 #endif
 
+#ifdef WITH_ONEAPI
+  if (mask & DEVICE_MASK_ONEAPI) {
+    if (device_oneapi_init()) {
+      capabilities += "\noneAPI device capabilities:\n";
+      capabilities += device_oneapi_capabilities();
+    }
+  }
+#endif
+
 #ifdef WITH_METAL
   if (mask & DEVICE_MASK_METAL) {
     if (device_metal_init()) {
@@ -380,6 +419,7 @@ void Device::free_memory()
   cuda_devices.free_memory();
   optix_devices.free_memory();
   hip_devices.free_memory();
+  oneapi_devices.free_memory();
   cpu_devices.free_memory();
   metal_devices.free_memory();
 }
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 927caae600c..340be85e853 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -40,6 +40,7 @@ enum DeviceType {
   DEVICE_OPTIX,
   DEVICE_HIP,
   DEVICE_METAL,
+  DEVICE_ONEAPI,
   DEVICE_DUMMY,
 };
 
@@ -49,6 +50,7 @@ enum DeviceTypeMask {
   DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
   DEVICE_MASK_HIP = (1 << DEVICE_HIP),
   DEVICE_MASK_METAL = (1 << DEVICE_METAL),
+  DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI),
   DEVICE_MASK_ALL = ~0
 };
 
@@ -273,6 +275,7 @@ class Device {
   static vector<DeviceInfo> cpu_devices;
   static vector<DeviceInfo> hip_devices;
   static vector<DeviceInfo> metal_devices;
+  static vector<DeviceInfo> oneapi_devices;
   static uint devices_initialized_mask;
 };
 
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
new file mode 100644
index 00000000000..b6f0f0c2b42
--- /dev/null
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#include "device/oneapi/device.h"
+
+#include "util/log.h"
+
+#ifdef WITH_ONEAPI
+#  include "device/device.h"
+#  include "device/oneapi/device_impl.h"
+
+#  include "util/path.h"
+#  include "util/string.h"
+
+#  ifdef __linux__
+#    include <dlfcn.h>
+#  endif
+#endif /* WITH_ONEAPI */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_ONEAPI
+static OneAPIDLLInterface oneapi_dll;
+#endif
+
+#ifdef _WIN32
+#  define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
+#  define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
+#  define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
+#elif __linux__
+#  define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
+#  define FREE_SHARED_LIBRARY(handle) dlclose(handle)
+#  define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
+#endif
+
+bool device_oneapi_init()
+{
+#if !defined(WITH_ONEAPI)
+  return false;
+#else
+
+  string lib_path = path_get("lib");
+#  ifdef _WIN32
+  lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
+#  else
+  lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
+#  endif
+  void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
+
+  /* This shouldn't happen, but it still makes sense to have a branch for this. */
+  if (lib_handle == NULL) {
+    LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not "
+                  "happen, however, it occurs hence oneAPI rendering will be disabled";
+    return false;
+  }
+
+#  define DLL_INTERFACE_CALL(function, return_type, ...) \
+    (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
+        GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
+    if (oneapi_dll.function == NULL) { \
+      LOG(ERROR) << "oneAPI shared library function \"" << #function \
+                 << "\" has not been loaded from kernel shared  - disable oneAPI " \
+                    "library disable oneAPI implementation due to this"; \
+      FREE_SHARED_LIBRARY(lib_handle); \
+      return false; \
+    }
+#  include "kernel/device/oneapi/dll_interface_template.h"
+#  undef DLL_INTERFACE_CALL
+
+  VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
+
+  /* We need to have this oneapi kernel shared library during all life-span of the Blender.
+   * So it is not unloaded because of this.
+   * FREE_SHARED_LIBRARY(lib_handle); */
+
+  /* NOTE(@nsirgien): we need to enable JIT cache from here and
+   * right now this cache policy is controlled by env. variables. */
+  /* NOTE(hallade) we also disable use of copy engine as it
+   * improves stability as of intel/llvm sycl-nightly/20220529.
+   * All these env variable can be set beforehand by end-users and
+   * will in that case -not- be overwritten. */
+#  ifdef _WIN32
+  if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) {
+    _putenv_s("SYCL_CACHE_PERSISTENT", "1");
+  }
+  if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) {
+    _putenv_s("SYCL_CACHE_THRESHOLD", "0");
+  }
+  if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
+    _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+  }
+  if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) {
+    _putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0");
+  }
+#  elif __linux__
+  setenv("SYCL_CACHE_PERSISTENT", "1", false);
+  setenv("SYCL_CACHE_THRESHOLD", "0", false);
+  setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+  setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
+#  endif
+
+  return true;
+#endif
+}
+
+#if defined(_WIN32) || defined(__linux__)
+#  undef LOAD_SYCL_SHARED_LIBRARY
+#  undef LOAD_ONEAPI_SHARED_LIBRARY
+#  undef FREE_SHARED_LIBRARY
+#  undef GET_SHARED_LIBRARY_SYMBOL
+#endif
+
+Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_ONEAPI
+  return new OneapiDevice(info, oneapi_dll, stats, profiler);
+#else
+  (void)info;
+  (void)stats;
+  (void)profiler;
+
+  LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build.";
+
+  return nullptr;
+#endif
+}
+
+#ifdef WITH_ONEAPI
+static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
+{
+  vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
+
+  DeviceInfo info;
+
+  info.type = DEVICE_ONEAPI;
+  info.description = name;
+  info.num = num;
+
+  /* NOTE(@nsirgien): Should be unique at least on proper oneapi installation. */
+  info.id = id;
+
+  info.has_nanovdb = true;
+  info.denoisers = 0;
+
+  info.has_gpu_queue = true;
+
+  /* NOTE(@nsirgien): oneAPI right now is focused on one device usage. In future it maybe will
+   * change, but right now peer access from one device to another device is not supported. */
+  info.has_peer_memory = false;
+
+  /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */
+  info.display_device = false;
+
+  devices->push_back(info);
+  VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+}
+#endif
+
+void device_oneapi_info(vector<DeviceInfo> &devices)
+{
+#ifdef WITH_ONEAPI
+  (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
+#else  /* WITH_ONEAPI */
+  (void)devices;
+#endif /* WITH_ONEAPI */
+}
+
+string device_oneapi_capabilities()
+{
+  string capabilities;
+#ifdef WITH_ONEAPI
+  char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
+  if (c_capabilities) {
+    capabilities = c_capabilities;
+    (oneapi_dll.oneapi_free)(c_capabilities);
+  }
+#endif
+  return capabilities;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/oneapi/device.h b/intern/cycles/device/oneapi/device.h
new file mode 100644
index 00000000000..db8c985d4d5
--- /dev/null
+++ b/intern/cycles/device/oneapi/device.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "util/string.h"
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_oneapi_init();
+
+Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_oneapi_info(vector<DeviceInfo> &devices);
+
+string device_oneapi_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
new file mode 100644
index 00000000000..8c8ab522b47
--- /dev/null
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+#  include "device/oneapi/device_impl.h"
+
+#  include "util/debug.h"
+#  include "util/log.h"
+
+#  include "kernel/device/oneapi/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+static void queue_error_cb(const char *message, void *user_ptr)
+{
+  if (user_ptr) {
+    *reinterpret_cast<std::string *>(user_ptr) = message;
+  }
+}
+
+OneapiDevice::OneapiDevice(const DeviceInfo &info,
+                           OneAPIDLLInterface &oneapi_dll_object,
+                           Stats &stats,
+                           Profiler &profiler)
+    : Device(info, stats, profiler),
+      device_queue_(nullptr),
+      texture_info_(this, "texture_info", MEM_GLOBAL),
+      kg_memory_(nullptr),
+      kg_memory_device_(nullptr),
+      kg_memory_size_(0),
+      oneapi_dll_(oneapi_dll_object)
+{
+  need_texture_info_ = false;
+
+  oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
+
+  /* Oneapi calls should be initialised on this moment. */
+  assert(oneapi_dll_.oneapi_create_queue != nullptr);
+
+  bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
+  if (is_finished_ok == false) {
+    set_error("oneAPI queue initialization error: got runtime exception \"" +
+              oneapi_error_string_ + "\"");
+  }
+  else {
+    VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
+               << info.description << "\"";
+    assert(device_queue_);
+  }
+
+  size_t globals_segment_size;
+  is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
+  if (is_finished_ok == false) {
+    set_error("oneAPI constant memory initialization got runtime exception \"" +
+              oneapi_error_string_ + "\"");
+  }
+  else {
+    VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
+  }
+
+  kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
+  oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
+
+  kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
+
+  kg_memory_size_ = globals_segment_size;
+}
+
+OneapiDevice::~OneapiDevice()
+{
+  texture_info_.free();
+  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
+  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
+
+  for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
+    delete mt->second;
+
+  if (device_queue_)
+    oneapi_dll_.oneapi_free_queue(device_queue_);
+}
+
+bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
+{
+  return false;
+}
+
+BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
+{
+  return BVH_LAYOUT_BVH2;
+}
+
+bool OneapiDevice::load_kernels(const uint requested_features)
+{
+  assert(device_queue_);
+  /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with sertain feature set
+   * with specialization constants, but it hasn't been implemented yet. */
+  (void)requested_features;
+
+  bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
+  if (is_finished_ok == false) {
+    set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
+  }
+  else {
+    VLOG_INFO << "Runtime compilation done for \"" << info.description << "\"";
+    assert(device_queue_);
+  }
+  return is_finished_ok;
+}
+
+void OneapiDevice::load_texture_info()
+{
+  if (need_texture_info_) {
+    need_texture_info_ = false;
+    texture_info_.copy_to_device();
+  }
+}
+
+void OneapiDevice::generic_alloc(device_memory &mem)
+{
+  size_t memory_size = mem.memory_size();
+
+  /* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
+   * we can use USM host memory.
+   * Because of the expected performance impact, implementation of this has had a low priority
+   * and is not implemented yet. */
+
+  assert(device_queue_);
+  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
+   * and shared. For new project it maybe more beneficial to use USM shared memory, because it
+   * provides automatic migration mechanism in order to allow to use the same pointer on host and
+   * on device, without need to worry about explicit memory transfer operations. But for
+   * Blender/Cycles this type of memory is not very suitable in current application architecture,
+   * because Cycles already uses two different pointer for host activity and device activity, and
+   * also has to perform all needed memory transfer operations. So, USM device memory
+   * type has been used for oneAPI device in order to better fit in Cycles architecture. */
+  void *device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
+  if (device_pointer == nullptr) {
+    size_t max_memory_on_device = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
+    set_error("oneAPI kernel - device memory allocation error for " +
+              string_human_readable_size(mem.memory_size()) +
+              ", possibly caused by lack of available memory space on the device: " +
+              string_human_readable_size(stats.mem_used) + " of " +
+              string_human_readable_size(max_memory_on_device) + " is already allocated");
+    return;
+  }
+  assert(device_pointer);
+
+  mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
+  mem.device_size = memory_size;
+
+  stats.mem_alloc(memory_size);
+}
+
+void OneapiDevice::generic_copy_to(device_memory &mem)
+{
+  size_t memory_size = mem.memory_size();
+
+  /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
+  assert(mem.host_pointer);
+  assert(device_queue_);
+  oneapi_dll_.oneapi_usm_memcpy(
+      device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+}
+
+/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
+SyclQueue *OneapiDevice::sycl_queue()
+{
+  return device_queue_;
+}
+
+string OneapiDevice::oneapi_error_message()
+{
+  return string(oneapi_error_string_);
+}
+
+OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
+{
+  return oneapi_dll_;
+}
+
+void *OneapiDevice::kernel_globals_device_pointer()
+{
+  return kg_memory_device_;
+}
+
+void OneapiDevice::generic_free(device_memory &mem)
+{
+  assert(mem.device_pointer);
+  stats.mem_free(mem.device_size);
+  mem.device_size = 0;
+
+  assert(device_queue_);
+  oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
+  mem.device_pointer = 0;
+}
+
+void OneapiDevice::mem_alloc(device_memory &mem)
+{
+  if (mem.type == MEM_TEXTURE) {
+    assert(!"mem_alloc not supported for textures.");
+  }
+  else if (mem.type == MEM_GLOBAL) {
+    assert(!"mem_alloc not supported for global memory.");
+  }
+  else {
+    if (mem.name) {
+      VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
+                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
+                 << string_human_readable_size(mem.memory_size()) << ")";
+    }
+    generic_alloc(mem);
+  }
+}
+
+void OneapiDevice::mem_copy_to(device_memory &mem)
+{
+  if (mem.name) {
+    VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
+               << string_human_readable_number(mem.memory_size()) << " bytes. ("
+               << string_human_readable_size(mem.memory_size()) << ")";
+  }
+
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+    global_alloc(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
+    tex_alloc((device_texture &)mem);
+  }
+  else {
+    if (!mem.device_pointer)
+      mem_alloc(mem);
+
+    generic_copy_to(mem);
+  }
+}
+
+void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
+{
+  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+    assert(!"mem_copy_from not supported for textures.");
+  }
+  else if (mem.host_pointer) {
+    const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
+    const size_t offset = elem * y * w;
+
+    if (mem.name) {
+      VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
+                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
+                 << string_human_readable_size(mem.memory_size()) << ") from offset " << offset
+                 << " data " << size << " bytes";
+    }
+
+    assert(device_queue_);
+
+    assert(size != 0);
+    assert(mem.device_pointer);
+    char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
+    char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
+    bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
+        device_queue_, shifted_host, shifted_device, size);
+    if (is_finished_ok == false) {
+      set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
+                "\"");
+    }
+  }
+}
+
+void OneapiDevice::mem_zero(device_memory &mem)
+{
+  if (mem.name) {
+    VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
+               << string_human_readable_number(mem.memory_size()) << " bytes. ("
+               << string_human_readable_size(mem.memory_size()) << ")\n";
+  }
+
+  if (!mem.device_pointer) {
+    mem_alloc(mem);
+  }
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  assert(device_queue_);
+  bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
+      device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
+  if (is_finished_ok == false) {
+    set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
+              "\"");
+  }
+}
+
+void OneapiDevice::mem_free(device_memory &mem)
+{
+  if (mem.name) {
+    VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
+               << string_human_readable_number(mem.device_size) << " bytes. ("
+               << string_human_readable_size(mem.device_size) << ")\n";
+  }
+
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
+  }
+  else {
+    generic_free(mem);
+  }
+}
+
+device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
+{
+  return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
+                                      mem.memory_elements_size(offset));
+}
+
+void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+  assert(name);
+
+  VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
+             << string_human_readable_number(size) << " bytes. ("
+             << string_human_readable_size(size) << ")";
+
+  ConstMemMap::iterator i = const_mem_map_.find(name);
+  device_vector<uchar> *data;
+
+  if (i == const_mem_map_.end()) {
+    data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
+    data->alloc(size);
+    const_mem_map_.insert(ConstMemMap::value_type(name, data));
+  }
+  else {
+    data = i->second;
+  }
+
+  assert(data->memory_size() <= size);
+  memcpy(data->data(), host, size);
+  data->copy_to_device();
+
+  oneapi_dll_.oneapi_set_global_memory(
+      device_queue_, kg_memory_, name, (void *)data->device_pointer);
+
+  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+}
+
+void OneapiDevice::global_alloc(device_memory &mem)
+{
+  assert(mem.name);
+
+  size_t size = mem.memory_size();
+  VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
+             << string_human_readable_number(size) << " bytes. ("
+             << string_human_readable_size(size) << ")";
+
+  generic_alloc(mem);
+  generic_copy_to(mem);
+
+  oneapi_dll_.oneapi_set_global_memory(
+      device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
+
+  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+}
+
+void OneapiDevice::global_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    generic_free(mem);
+  }
+}
+
+void OneapiDevice::tex_alloc(device_texture &mem)
+{
+  generic_alloc(mem);
+  generic_copy_to(mem);
+
+  /* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
+  const uint slot = mem.slot;
+  if (slot >= texture_info_.size()) {
+    texture_info_.resize(slot + 128);
+  }
+
+  texture_info_[slot] = mem.info;
+  need_texture_info_ = true;
+
+  texture_info_[slot].data = (uint64_t)mem.device_pointer;
+}
+
+void OneapiDevice::tex_free(device_texture &mem)
+{
+  /* There is no texture memory in SYCL. */
+  if (mem.device_pointer) {
+    generic_free(mem);
+  }
+}
+
+unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
+{
+  return make_unique<OneapiDeviceQueue>(this);
+}
+
+bool OneapiDevice::should_use_graphics_interop()
+{
+  /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
+   * return false. */
+  return false;
+}
+
+void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
+{
+  assert(device_queue_);
+  return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
+}
+
+void OneapiDevice::usm_free(void *usm_ptr)
+{
+  assert(device_queue_);
+  return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
new file mode 100644
index 00000000000..f925687ebe9
--- /dev/null
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+#  include "device/device.h"
+#  include "device/oneapi/device.h"
+#  include "device/oneapi/queue.h"
+
+#  include "util/map.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class OneapiDevice : public Device {
+ private:
+  SyclQueue *device_queue_;
+
+  using ConstMemMap = map<string, device_vector<uchar> *>;
+  ConstMemMap const_mem_map_;
+  device_vector<TextureInfo> texture_info_;
+  bool need_texture_info_;
+  void *kg_memory_;
+  void *kg_memory_device_;
+  size_t kg_memory_size_ = (size_t)0;
+  OneAPIDLLInterface oneapi_dll_;
+  std::string oneapi_error_string_;
+
+ public:
+  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+  OneapiDevice(const DeviceInfo &info,
+               OneAPIDLLInterface &oneapi_dll_object,
+               Stats &stats,
+               Profiler &profiler);
+
+  virtual ~OneapiDevice();
+
+  bool check_peer_access(Device *peer_device) override;
+
+  bool load_kernels(const uint requested_features) override;
+
+  void load_texture_info();
+
+  void generic_alloc(device_memory &mem);
+
+  void generic_copy_to(device_memory &mem);
+
+  void generic_free(device_memory &mem);
+
+  SyclQueue *sycl_queue();
+
+  string oneapi_error_message();
+
+  OneAPIDLLInterface oneapi_dll_object();
+
+  void *kernel_globals_device_pointer();
+
+  void mem_alloc(device_memory &mem) override;
+
+  void mem_copy_to(device_memory &mem) override;
+
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+  void mem_copy_from(device_memory &mem)
+  {
+    mem_copy_from(mem, 0, 0, 0, 0);
+  }
+
+  void mem_zero(device_memory &mem) override;
+
+  void mem_free(device_memory &mem) override;
+
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+  virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+  void global_alloc(device_memory &mem);
+
+  void global_free(device_memory &mem);
+
+  void tex_alloc(device_texture &mem);
+
+  void tex_free(device_texture &mem);
+
+  /* Graphics resources interoperability. */
+  virtual bool should_use_graphics_interop() override;
+
+  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+  /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
+   * side compilation (MSVC). */
+  void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
+  void usm_free(void *usm_ptr);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h
new file mode 100644
index 00000000000..bc681ff8f64
--- /dev/null
+++ b/intern/cycles/device/oneapi/dll_interface.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+/* Include kernel header to get access to sycl-specific types, like SyclQueue and
+ * OneAPIDeviceIteratorCallback. */
+#include "kernel/device/oneapi/kernel.h"
+
+#ifdef WITH_ONEAPI
+struct OneAPIDLLInterface {
+#  define DLL_INTERFACE_CALL(function, return_type, ...) \
+    return_type (*function)(__VA_ARGS__) = nullptr;
+#  include "kernel/device/oneapi/dll_interface_template.h"
+#  undef DLL_INTERFACE_CALL
+};
+#endif
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
new file mode 100644
index 00000000000..42e2408ee7a
--- /dev/null
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+#  include "device/oneapi/queue.h"
+#  include "device/oneapi/device_impl.h"
+#  include "util/log.h"
+#  include "util/time.h"
+#  include <iomanip>
+#  include <vector>
+
+#  include "kernel/device/oneapi/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelExecutionInfo {
+  double elapsed_summary = 0.0;
+  int enqueue_count = 0;
+};
+
+/* OneapiDeviceQueue */
+
+OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
+    : DeviceQueue(device),
+      oneapi_device_(device),
+      oneapi_dll_(device->oneapi_dll_object()),
+      kernel_context_(nullptr)
+{
+}
+
+OneapiDeviceQueue::~OneapiDeviceQueue()
+{
+  delete kernel_context_;
+}
+
+int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+  int num_states;
+
+  /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
+  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
+      oneapi_device_->sycl_queue());
+  if (compute_units >= 128) {
+    /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */
+    int base = 1024 * 1024;
+    /* linear dependency (with coefficient less that 1) from amount of compute units. */
+    num_states = (base * (compute_units / 128)) * 3 / 4;
+
+    /* Limit amount of integrator states by one quarter of device memory, because
+     * other allocations will need some space as well
+     * TODO: base this calculation on the how many states what the GPU is actually capable of
+     * running, with some headroom to improve occupancy. If the texture don't fit, offload into
+     * unified memory. */
+    size_t states_memory_size = num_states * state_size;
+    size_t device_memory_amount =
+        (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
+    if (states_memory_size >= device_memory_amount / 4) {
+      num_states = device_memory_amount / 4 / state_size;
+    }
+  }
+  else {
+    /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
+     * memory. */
+    num_states = 1024 * 512;
+  }
+
+  VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
+                    << string_human_readable_size(num_states * state_size);
+
+  return num_states;
+}
+
+int OneapiDeviceQueue::num_concurrent_busy_states() const
+{
+  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
+      oneapi_device_->sycl_queue());
+  if (compute_units >= 128) {
+    return 1024 * 1024;
+  }
+  else {
+    return 1024 * 512;
+  }
+}
+
+void OneapiDeviceQueue::init_execution()
+{
+  oneapi_device_->load_texture_info();
+
+  SyclQueue *device_queue = oneapi_device_->sycl_queue();
+  void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
+  assert(device_queue);
+  assert(kg_dptr);
+  kernel_context_ = new KernelContext{device_queue, kg_dptr};
+
+  debug_init_execution();
+}
+
+bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
+                                const int signed_kernel_work_size,
+                                DeviceKernelArguments const &_args)
+{
+  if (oneapi_device_->have_error()) {
+    return false;
+  }
+
+  void **args = const_cast<void **>(_args.values);
+
+  debug_enqueue(kernel, signed_kernel_work_size);
+  assert(signed_kernel_work_size >= 0);
+  size_t kernel_work_size = (size_t)signed_kernel_work_size;
+
+  size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
+      kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
+  size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
+
+  assert(kernel_context_);
+
+  /* Call the oneAPI kernel DLL to launch the requested kernel. */
+  bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
+      kernel_context_, kernel, uniformed_kernel_work_size, args);
+
+  if (is_finished_ok == false) {
+    oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +
+                              "\" execution error: got runtime exception \"" +
+                              oneapi_device_->oneapi_error_message() + "\"");
+  }
+
+  return is_finished_ok;
+}
+
+bool OneapiDeviceQueue::synchronize()
+{
+  if (oneapi_device_->have_error()) {
+    return false;
+  }
+
+  bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
+  if (is_finished_ok == false)
+    oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
+                              oneapi_device_->oneapi_error_message() + "\"");
+
+  debug_synchronize();
+
+  return !(oneapi_device_->have_error());
+}
+
+void OneapiDeviceQueue::zero_to_device(device_memory &mem)
+{
+  oneapi_device_->mem_zero(mem);
+}
+
+void OneapiDeviceQueue::copy_to_device(device_memory &mem)
+{
+  oneapi_device_->mem_copy_to(mem);
+}
+
+void OneapiDeviceQueue::copy_from_device(device_memory &mem)
+{
+  oneapi_device_->mem_copy_from(mem);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
new file mode 100644
index 00000000000..09a015303b6
--- /dev/null
+++ b/intern/cycles/device/oneapi/queue.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#ifdef WITH_ONEAPI
+
+#  include "device/kernel.h"
+#  include "device/memory.h"
+#  include "device/queue.h"
+
+#  include "device/oneapi/device.h"
+#  include "device/oneapi/dll_interface.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OneapiDevice;
+class device_memory;
+
+/* Base class for Oneapi queues. */
+class OneapiDeviceQueue : public DeviceQueue {
+ public:
+  explicit OneapiDeviceQueue(OneapiDevice *device);
+  ~OneapiDeviceQueue();
+
+  virtual int num_concurrent_states(const size_t state_size) const override;
+
+  virtual int num_concurrent_busy_states() const override;
+
+  virtual void init_execution() override;
+
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int kernel_work_size,
+                       DeviceKernelArguments const &args) override;
+
+  virtual bool synchronize() override;
+
+  virtual void zero_to_device(device_memory &mem) override;
+  virtual void copy_to_device(device_memory &mem) override;
+  virtual void copy_from_device(device_memory &mem) override;
+
+ protected:
+  OneapiDevice *oneapi_device_;
+  OneAPIDLLInterface oneapi_dll_;
+  KernelContext *kernel_context_;
+  bool with_kernel_statistics_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_ONEAPI */
author	Xavier Hallade <xavier.hallade@intel.com>	2022-06-29 13:58:04 +0300
committer	Xavier Hallade <xavier.hallade@intel.com>	2022-06-29 13:58:04 +0300
commit	a02992f1313811c9905e44dc95a0aee31d707f67 (patch)
tree	2d1f59524e2a298bb530ad578a2d2b9e2c4a1432 /intern/cycles/device
parent	302b04a5a3fc0e767ac784424f78ce2edf5d2844 (diff)