diff options
author | Xavier Hallade <xavier.hallade@intel.com> | 2022-06-29 13:58:04 +0300 |
---|---|---|
committer | Xavier Hallade <xavier.hallade@intel.com> | 2022-06-29 13:58:04 +0300 |
commit | a02992f1313811c9905e44dc95a0aee31d707f67 (patch) | |
tree | 2d1f59524e2a298bb530ad578a2d2b9e2c4a1432 /intern | |
parent | 302b04a5a3fc0e767ac784424f78ce2edf5d2844 (diff) |
Cycles: Add support for rendering on Intel GPUs using oneAPI
This patch adds a new Cycles device with similar functionality to the
existing GPU devices. Kernel compilation and runtime interaction happen
via oneAPI DPC++ compiler and SYCL API.
This implementation is primarily focused on Intel® Arc™ GPUs and other
future Intel GPUs. The first supported drivers are 101.1660 on Windows
and 22.10.22597 on Linux.
The necessary tools for compilation are:
- A SYCL compiler such as oneAPI DPC++ compiler or
https://github.com/intel/llvm
- Intel® oneAPI Level Zero which is used for low level device queries:
https://github.com/oneapi-src/level-zero
- To optionally generate prebuilt graphics binaries: Intel® Graphics
Compiler. All are included in Linux precompiled libraries on svn:
https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for
Windows precompiled binaries but for the graphics compiler, available
as "Intel® Graphics Offline Compiler for OpenCL™ Code" from
https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html,
for which the path can be set via OCLOC_INSTALL_DIR.
Being based on the open SYCL standard, this implementation could also be
extended to run on other compatible non-Intel hardware in the future.
Reviewed By: sergey, brecht
Differential Revision: https://developer.blender.org/D15254
Co-authored-by: Nikita Sirgienko <nikita.sirgienko@intel.com>
Co-authored-by: Stefan Werner <stefan.werner@intel.com>
Diffstat (limited to 'intern')
61 files changed, 3379 insertions, 74 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index f5d717e70fc..82fd81be262 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -263,6 +263,10 @@ if(WITH_CYCLES_DEVICE_OPTIX) endif() endif() +if (WITH_CYCLES_DEVICE_ONEAPI) + add_definitions(-DWITH_ONEAPI) +endif() + if(WITH_CYCLES_EMBREE) add_definitions(-DWITH_EMBREE) include_directories( diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 17f05f6da34..7d7ca78c15a 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -118,7 +118,8 @@ enum_device_type = ( ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), ('HIP', "HIP", "HIP", 4), - ('METAL', "Metal", "Metal", 5) + ('METAL', "Metal", "Metal", 5), + ('ONEAPI', "oneAPI", "oneAPI", 6) ) enum_texture_limit = ( @@ -1397,7 +1398,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types() + has_cuda, has_optix, has_hip, has_metal, has_oneapi = _cycles.get_device_types() + list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) @@ -1407,6 +1409,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4)) if has_metal: list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5)) + if has_oneapi: + list.append(('ONEAPI', "oneAPI", "Use oneAPI for GPU acceleration", 6)) return list @@ -1438,7 +1442,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL', 'ONEAPI'}: continue # Try to find existing Device entry entry = 
self.find_existing_device_entry(device) @@ -1482,7 +1486,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'): + for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL', 'ONEAPI'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. @@ -1550,13 +1554,25 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif sys.platform.startswith("linux"): col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1') col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1') + elif device_type == 'ONEAPI': + import sys + col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1') + if sys.platform.startswith("win"): + col.label(text="and Windows driver version 101.1660 or newer", icon='BLANK1') + elif sys.platform.startswith("linux"): + col.label(text="and Linux driver version xx.xx.20066 or newer", icon='BLANK1') elif device_type == 'METAL': col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1') col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1') return for device in devices: - box.prop(device, "use", text=device.name) + import unicodedata + box.prop(device, "use", text=device.name + .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) + .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) + .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')) + ) def draw_impl(self, layout, context): row = layout.row() diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 88be546746d..5b8c3960c82 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -110,6 +110,10 @@ def use_optix(context): return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') +def 
use_oneapi(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'ONEAPI' and cscene.device == 'GPU') def use_multi_device(context): cscene = context.scene.cycles diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index 38effa329a5..22beca898f1 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -15,6 +15,7 @@ enum ComputeDevice { COMPUTE_DEVICE_OPTIX = 3, COMPUTE_DEVICE_HIP = 4, COMPUTE_DEVICE_METAL = 5, + COMPUTE_DEVICE_ONEAPI = 6, COMPUTE_DEVICE_NUM }; @@ -76,6 +77,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_METAL) { mask |= DEVICE_MASK_METAL; } + else if (compute_device == COMPUTE_DEVICE_ONEAPI) { + mask |= DEVICE_MASK_ONEAPI; + } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. */ diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index 7bd1ad2cafe..8b2b331f73e 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -871,18 +871,20 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false; + bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); has_hip |= (device_type == DEVICE_HIP); has_metal |= (device_type == DEVICE_METAL); + has_oneapi |= (device_type == DEVICE_ONEAPI); } - PyObject *list = PyTuple_New(4); + PyObject *list = PyTuple_New(5); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, 
PyBool_FromLong(has_optix)); PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal)); + PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi)); return list; } @@ -914,6 +916,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) else if (override == "METAL") { BlenderSession::device_override = DEVICE_MASK_METAL; } + else if (override == "ONEAPI") { + BlenderSession::device_override = DEVICE_MASK_ONEAPI; + } else { printf("\nError: %s is not a valid Cycles device.\n", override.c_str()); Py_RETURN_FALSE; diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index d2f30fe764b..51830250f2e 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -91,6 +91,8 @@ if(CYCLES_STANDALONE_REPOSITORY) _set_default(USD_ROOT_DIR "${_cycles_lib_dir}/usd") _set_default(WEBP_ROOT_DIR "${_cycles_lib_dir}/webp") _set_default(ZLIB_ROOT "${_cycles_lib_dir}/zlib") + _set_default(LEVEL_ZERO_ROOT_DIR "${_cycles_lib_dir}/level-zero") + _set_default(SYCL_ROOT_DIR "${_cycles_lib_dir}/dpcpp") # Ignore system libraries set(CMAKE_IGNORE_PATH "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES};${CMAKE_SYSTEM_INCLUDE_PATH};${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES};${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") @@ -647,3 +649,22 @@ if(WITH_CYCLES_DEVICE_METAL) message(STATUS "Found Metal: ${METAL_LIBRARY}") endif() endif() + +########################################################################### +# oneAPI +########################################################################### + +if (WITH_CYCLES_DEVICE_ONEAPI) + find_package(SYCL) + find_package(LevelZero) + + if (SYCL_FOUND AND LEVEL_ZERO_FOUND) + message(STATUS "Found oneAPI: ${SYCL_LIBRARY}") + message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}") + else() + message(STATUS "oneAPI or Level Zero not found, disabling oneAPI device from Cycles") + set(WITH_CYCLES_DEVICE_ONEAPI OFF) + 
endif() +endif() + +unset(_cycles_lib_dir) diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6205775260a..6418801c572 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -82,6 +82,15 @@ set(SRC_HIP hip/util.h ) +set(SRC_ONEAPI + oneapi/device_impl.cpp + oneapi/device_impl.h + oneapi/device.cpp + oneapi/device.h + oneapi/queue.cpp + oneapi/queue.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -134,6 +143,7 @@ set(SRC ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} + ${SRC_ONEAPI} ${SRC_HEADERS} ) @@ -181,6 +191,9 @@ if(WITH_CYCLES_DEVICE_METAL) ${SRC_METAL} ) endif() +if (WITH_CYCLES_DEVICE_ONEAPI) + add_definitions(-DWITH_ONEAPI) +endif() if(WITH_OPENIMAGEDENOISE) list(APPEND LIB @@ -193,6 +206,11 @@ include_directories(SYSTEM ${INC_SYS}) cycles_add_library(cycles_device "${LIB}" ${SRC}) +if(WITH_CYCLES_DEVICE_ONEAPI) + # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour + add_dependencies(cycles_device cycles_kernel_oneapi) +endif() + source_group("cpu" FILES ${SRC_CPU}) source_group("cuda" FILES ${SRC_CUDA}) source_group("dummy" FILES ${SRC_DUMMY}) @@ -200,4 +218,5 @@ source_group("hip" FILES ${SRC_HIP}) source_group("multi" FILES ${SRC_MULTI}) source_group("metal" FILES ${SRC_METAL}) source_group("optix" FILES ${SRC_OPTIX}) +source_group("oneapi" FILES ${SRC_ONEAPI}) source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 82c7881da5f..ace6ed517f5 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -16,6 +16,7 @@ #include "device/hip/device.h" #include "device/metal/device.h" #include "device/multi/device.h" +#include "device/oneapi/device.h" #include "device/optix/device.h" #include "util/foreach.h" @@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; 
vector<DeviceInfo> Device::hip_devices; vector<DeviceInfo> Device::metal_devices; +vector<DeviceInfo> Device::oneapi_devices; uint Device::devices_initialized_mask = 0; /* Device */ @@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_metal_create(info, stats, profiler); break; #endif + +#ifdef WITH_ONEAPI + case DEVICE_ONEAPI: + device = device_oneapi_create(info, stats, profiler); + break; +#endif + default: break; } @@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_HIP; else if (strcmp(name, "METAL") == 0) return DEVICE_METAL; + else if (strcmp(name, "ONEAPI") == 0) + return DEVICE_ONEAPI; return DEVICE_NONE; } @@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type) return "HIP"; else if (type == DEVICE_METAL) return "METAL"; + else if (type == DEVICE_ONEAPI) + return "ONEAPI"; return ""; } @@ -164,6 +177,9 @@ vector<DeviceType> Device::available_types() #ifdef WITH_METAL types.push_back(DEVICE_METAL); #endif +#ifdef WITH_ONEAPI + types.push_back(DEVICE_ONEAPI); +#endif return types; } @@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) { + if (device_oneapi_init()) { + device_oneapi_info(oneapi_devices); + } + devices_initialized_mask |= DEVICE_MASK_ONEAPI; + } + foreach (DeviceInfo &info, oneapi_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (device_oneapi_init()) { + capabilities += "\noneAPI device capabilities:\n"; + capabilities += device_oneapi_capabilities(); + } + } +#endif + #ifdef WITH_METAL if (mask & DEVICE_MASK_METAL) { if 
(device_metal_init()) { @@ -380,6 +419,7 @@ void Device::free_memory() cuda_devices.free_memory(); optix_devices.free_memory(); hip_devices.free_memory(); + oneapi_devices.free_memory(); cpu_devices.free_memory(); metal_devices.free_memory(); } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 927caae600c..340be85e853 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -40,6 +40,7 @@ enum DeviceType { DEVICE_OPTIX, DEVICE_HIP, DEVICE_METAL, + DEVICE_ONEAPI, DEVICE_DUMMY, }; @@ -49,6 +50,7 @@ enum DeviceTypeMask { DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_METAL = (1 << DEVICE_METAL), + DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI), DEVICE_MASK_ALL = ~0 }; @@ -273,6 +275,7 @@ class Device { static vector<DeviceInfo> cpu_devices; static vector<DeviceInfo> hip_devices; static vector<DeviceInfo> metal_devices; + static vector<DeviceInfo> oneapi_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp new file mode 100644 index 00000000000..b6f0f0c2b42 --- /dev/null +++ b/intern/cycles/device/oneapi/device.cpp @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#include "device/oneapi/device.h" + +#include "util/log.h" + +#ifdef WITH_ONEAPI +# include "device/device.h" +# include "device/oneapi/device_impl.h" + +# include "util/path.h" +# include "util/string.h" + +# ifdef __linux__ +# include <dlfcn.h> +# endif +#endif /* WITH_ONEAPI */ + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_ONEAPI +static OneAPIDLLInterface oneapi_dll; +#endif + +#ifdef _WIN32 +# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path)) +# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name) +#elif __linux__ +# define LOAD_ONEAPI_SHARED_LIBRARY(path) 
dlopen(path, RTLD_NOW) +# define FREE_SHARED_LIBRARY(handle) dlclose(handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name) +#endif + +bool device_oneapi_init() +{ +#if !defined(WITH_ONEAPI) + return false; +#else + + string lib_path = path_get("lib"); +# ifdef _WIN32 + lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll"); +# else + lib_path = path_join(lib_path, "cycles_kernel_oneapi.so"); +# endif + void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str()); + + /* This shouldn't happen, but it still makes sense to have a branch for this. */ + if (lib_handle == NULL) { + LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not " + "happen, however, it occurs hence oneAPI rendering will be disabled"; + return false; + } + +# define DLL_INTERFACE_CALL(function, return_type, ...) \ + (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \ + GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \ + if (oneapi_dll.function == NULL) { \ + LOG(ERROR) << "oneAPI shared library function \"" << #function \ + << "\" has not been loaded from kernel shared - disable oneAPI " \ + "library disable oneAPI implementation due to this"; \ + FREE_SHARED_LIBRARY(lib_handle); \ + return false; \ + } +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + + VLOG_INFO << "oneAPI kernel shared library has been loaded successfully"; + + /* We need to have this oneapi kernel shared library during all life-span of the Blender. + * So it is not unloaded because of this. + * FREE_SHARED_LIBRARY(lib_handle); */ + + /* NOTE(@nsirgien): we need to enable JIT cache from here and + * right now this cache policy is controlled by env. variables. */ + /* NOTE(hallade) we also disable use of copy engine as it + * improves stability as of intel/llvm sycl-nightly/20220529. + * All these env variable can be set beforehand by end-users and + * will in that case -not- be overwritten. 
*/ +# ifdef _WIN32 + if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) { + _putenv_s("SYCL_CACHE_PERSISTENT", "1"); + } + if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) { + _putenv_s("SYCL_CACHE_THRESHOLD", "0"); + } + if (getenv("SYCL_DEVICE_FILTER") == nullptr) { + _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero"); + } + if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) { + _putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0"); + } +# elif __linux__ + setenv("SYCL_CACHE_PERSISTENT", "1", false); + setenv("SYCL_CACHE_THRESHOLD", "0", false); + setenv("SYCL_DEVICE_FILTER", "host,level_zero", false); + setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false); +# endif + + return true; +#endif +} + +#if defined(_WIN32) || defined(__linux__) +# undef LOAD_SYCL_SHARED_LIBRARY +# undef LOAD_ONEAPI_SHARED_LIBRARY +# undef FREE_SHARED_LIBRARY +# undef GET_SHARED_LIBRARY_SYMBOL +#endif + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_ONEAPI + return new OneapiDevice(info, oneapi_dll, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build."; + + return nullptr; +#endif +} + +#ifdef WITH_ONEAPI +static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr) +{ + vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr; + + DeviceInfo info; + + info.type = DEVICE_ONEAPI; + info.description = name; + info.num = num; + + /* NOTE(@nsirgien): Should be unique at least on proper oneapi installation. */ + info.id = id; + + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + + /* NOTE(@nsirgien): oneAPI right now is focused on one device usage. In future it maybe will + * change, but right now peer access from one device to another device is not supported. 
*/ + info.has_peer_memory = false; + + /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */ + info.display_device = false; + + devices->push_back(info); + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; +} +#endif + +void device_oneapi_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_ONEAPI + (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices); +#else /* WITH_ONEAPI */ + (void)devices; +#endif /* WITH_ONEAPI */ +} + +string device_oneapi_capabilities() +{ + string capabilities; +#ifdef WITH_ONEAPI + char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)(); + if (c_capabilities) { + capabilities = c_capabilities; + (oneapi_dll.oneapi_free)(c_capabilities); + } +#endif + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device.h b/intern/cycles/device/oneapi/device.h new file mode 100644 index 00000000000..db8c985d4d5 --- /dev/null +++ b/intern/cycles/device/oneapi/device.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#include "util/string.h" +#include "util/vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_oneapi_init(); + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_oneapi_info(vector<DeviceInfo> &devices); + +string device_oneapi_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp new file mode 100644 index 00000000000..8c8ab522b47 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/device_impl.h" + +# include "util/debug.h" +# include "util/log.h" + +# include "kernel/device/oneapi/kernel.h" + 
+CCL_NAMESPACE_BEGIN + +static void queue_error_cb(const char *message, void *user_ptr) +{ + if (user_ptr) { + *reinterpret_cast<std::string *>(user_ptr) = message; + } +} + +OneapiDevice::OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler) + : Device(info, stats, profiler), + device_queue_(nullptr), + texture_info_(this, "texture_info", MEM_GLOBAL), + kg_memory_(nullptr), + kg_memory_device_(nullptr), + kg_memory_size_(0), + oneapi_dll_(oneapi_dll_object) +{ + need_texture_info_ = false; + + oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); + + /* Oneapi calls should be initialised on this moment. */ + assert(oneapi_dll_.oneapi_create_queue != nullptr); + + bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num); + if (is_finished_ok == false) { + set_error("oneAPI queue initialization error: got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "oneAPI queue has been successfully created for the device \"" + << info.description << "\""; + assert(device_queue_); + } + + size_t globals_segment_size; + is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size); + if (is_finished_ok == false) { + set_error("oneAPI constant memory initialization got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)"; + } + + kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); + oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); + + kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size); + + kg_memory_size_ = globals_segment_size; +} + +OneapiDevice::~OneapiDevice() +{ + texture_info_.free(); + oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_); + 
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_); + + for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++) + delete mt->second; + + if (device_queue_) + oneapi_dll_.oneapi_free_queue(device_queue_); +} + +bool OneapiDevice::check_peer_access(Device * /*peer_device*/) +{ + return false; +} + +BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +bool OneapiDevice::load_kernels(const uint requested_features) +{ + assert(device_queue_); + /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with sertain feature set + * with specialization constants, but it hasn't been implemented yet. */ + (void)requested_features; + + bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_); + if (is_finished_ok == false) { + set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\""); + } + else { + VLOG_INFO << "Runtime compilation done for \"" << info.description << "\""; + assert(device_queue_); + } + return is_finished_ok; +} + +void OneapiDevice::load_texture_info() +{ + if (need_texture_info_) { + need_texture_info_ = false; + texture_info_.copy_to_device(); + } +} + +void OneapiDevice::generic_alloc(device_memory &mem) +{ + size_t memory_size = mem.memory_size(); + + /* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then + * we can use USM host memory. + * Because of the expected performance impact, implementation of this has had a low priority + * and is not implemented yet. */ + + assert(device_queue_); + /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device + * and shared. For new project it maybe more beneficial to use USM shared memory, because it + * provides automatic migration mechanism in order to allow to use the same pointer on host and + * on device, without need to worry about explicit memory transfer operations. 
But for + * Blender/Cycles this type of memory is not very suitable in current application architecture, + * because Cycles already uses two different pointer for host activity and device activity, and + * also has to perform all needed memory transfer operations. So, USM device memory + * type has been used for oneAPI device in order to better fit in Cycles architecture. */ + void *device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size); + if (device_pointer == nullptr) { + size_t max_memory_on_device = oneapi_dll_.oneapi_get_memcapacity(device_queue_); + set_error("oneAPI kernel - device memory allocation error for " + + string_human_readable_size(mem.memory_size()) + + ", possibly caused by lack of available memory space on the device: " + + string_human_readable_size(stats.mem_used) + " of " + + string_human_readable_size(max_memory_on_device) + " is already allocated"); + return; + } + assert(device_pointer); + + mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer); + mem.device_size = memory_size; + + stats.mem_alloc(memory_size); +} + +void OneapiDevice::generic_copy_to(device_memory &mem) +{ + size_t memory_size = mem.memory_size(); + + /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */ + assert(mem.host_pointer); + assert(device_queue_); + oneapi_dll_.oneapi_usm_memcpy( + device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size); +} + +/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. 
*/ +SyclQueue *OneapiDevice::sycl_queue() +{ + return device_queue_; +} + +string OneapiDevice::oneapi_error_message() +{ + return string(oneapi_error_string_); +} + +OneAPIDLLInterface OneapiDevice::oneapi_dll_object() +{ + return oneapi_dll_; +} + +void *OneapiDevice::kernel_globals_device_pointer() +{ + return kg_memory_device_; +} + +void OneapiDevice::generic_free(device_memory &mem) +{ + assert(mem.device_pointer); + stats.mem_free(mem.device_size); + mem.device_size = 0; + + assert(device_queue_); + oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer); + mem.device_pointer = 0; +} + +void OneapiDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + generic_alloc(mem); + } +} + +void OneapiDevice::mem_copy_to(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) + mem_alloc(mem); + + generic_copy_to(mem); + } +} + +void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } + else if (mem.host_pointer) { + const size_t size = (w > 0 || h > 0 || elem > 0) ? 
(elem * w * h) : mem.memory_size(); + const size_t offset = elem * y * w; + + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ") from offset " << offset + << " data " << size << " bytes"; + } + + assert(device_queue_); + + assert(size != 0); + assert(mem.device_pointer); + char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset; + char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset; + bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy( + device_queue_, shifted_host, shifted_device, size); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + + "\""); + } + } +} + +void OneapiDevice::mem_zero(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")\n"; + } + + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + assert(device_queue_); + bool is_finished_ok = oneapi_dll_.oneapi_usm_memset( + device_queue_, (void *)mem.device_pointer, 0, mem.memory_size()); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + + "\""); + } +} + +void OneapiDevice::mem_free(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", " + << string_human_readable_number(mem.device_size) << " bytes. 
(" + << string_human_readable_size(mem.device_size) << ")\n"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) + + mem.memory_elements_size(offset)); +} + +void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) +{ + assert(name); + + VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object " + << string_human_readable_number(size) << " bytes. (" + << string_human_readable_size(size) << ")"; + + ConstMemMap::iterator i = const_mem_map_.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map_.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map_.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + assert(data->memory_size() <= size); + memcpy(data->data(), host, size); + data->copy_to_device(); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, name, (void *)data->device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_alloc(device_memory &mem) +{ + assert(mem.name); + + size_t size = mem.memory_size(); + VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object " + << string_human_readable_number(size) << " bytes. 
(" + << string_human_readable_size(size) << ")"; + + generic_alloc(mem); + generic_copy_to(mem); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + generic_free(mem); + } +} + +void OneapiDevice::tex_alloc(device_texture &mem) +{ + generic_alloc(mem); + generic_copy_to(mem); + + /* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */ + const uint slot = mem.slot; + if (slot >= texture_info_.size()) { + texture_info_.resize(slot + 128); + } + + texture_info_[slot] = mem.info; + need_texture_info_ = true; + + texture_info_[slot].data = (uint64_t)mem.device_pointer; +} + +void OneapiDevice::tex_free(device_texture &mem) +{ + /* There is no texture memory in SYCL. */ + if (mem.device_pointer) { + generic_free(mem); + } +} + +unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create() +{ + return make_unique<OneapiDeviceQueue>(this); +} + +bool OneapiDevice::should_use_graphics_interop() +{ + /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so + * return false. 
*/ + return false; +} + +void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment); +} + +void OneapiDevice::usm_free(void *usm_ptr) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h new file mode 100644 index 00000000000..f925687ebe9 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/device.h" +# include "device/oneapi/device.h" +# include "device/oneapi/queue.h" + +# include "util/map.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class OneapiDevice : public Device { + private: + SyclQueue *device_queue_; + + using ConstMemMap = map<string, device_vector<uchar> *>; + ConstMemMap const_mem_map_; + device_vector<TextureInfo> texture_info_; + bool need_texture_info_; + void *kg_memory_; + void *kg_memory_device_; + size_t kg_memory_size_ = (size_t)0; + OneAPIDLLInterface oneapi_dll_; + std::string oneapi_error_string_; + + public: + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler); + + virtual ~OneapiDevice(); + + bool check_peer_access(Device *peer_device) override; + + bool load_kernels(const uint requested_features) override; + + void load_texture_info(); + + void generic_alloc(device_memory &mem); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + SyclQueue *sycl_queue(); + + string oneapi_error_message(); + + OneAPIDLLInterface oneapi_dll_object(); + + void *kernel_globals_device_pointer(); + + void mem_alloc(device_memory 
&mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_copy_from(device_memory &mem) + { + mem_copy_from(mem, 0, 0, 0, 0); + } + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + /* Graphics resources interoperability. */ + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host + * side compilation (MSVC). */ + void *usm_aligned_alloc_host(size_t memory_size, size_t alignment); + void usm_free(void *usm_ptr); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h new file mode 100644 index 00000000000..bc681ff8f64 --- /dev/null +++ b/intern/cycles/device/oneapi/dll_interface.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +/* Include kernel header to get access to sycl-specific types, like SyclQueue and + * OneAPIDeviceIteratorCallback. */ +#include "kernel/device/oneapi/kernel.h" + +#ifdef WITH_ONEAPI +struct OneAPIDLLInterface { +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + return_type (*function)(__VA_ARGS__) = nullptr; +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL +}; +#endif diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp new file mode 100644 index 00000000000..42e2408ee7a --- /dev/null +++ b/intern/cycles/device/oneapi/queue.cpp @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/queue.h" +# include "device/oneapi/device_impl.h" +# include "util/log.h" +# include "util/time.h" +# include <iomanip> +# include <vector> + +# include "kernel/device/oneapi/kernel.h" + +CCL_NAMESPACE_BEGIN + +struct KernelExecutionInfo { + double elapsed_summary = 0.0; + int enqueue_count = 0; +}; + +/* OneapiDeviceQueue */ + +OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device) + : DeviceQueue(device), + oneapi_device_(device), + oneapi_dll_(device->oneapi_dll_object()), + kernel_context_(nullptr) +{ +} + +OneapiDeviceQueue::~OneapiDeviceQueue() +{ + delete kernel_context_; +} + +int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states; + + /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */ + const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( + oneapi_device_->sycl_queue()); + if (compute_units >= 128) { + /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */ + int base = 1024 * 1024; + /* linear dependency (with coefficient less that 1) from amount of compute units. */ + num_states = (base * (compute_units / 128)) * 3 / 4; + + /* Limit amount of integrator states by one quarter of device memory, because + * other allocations will need some space as well + * TODO: base this calculation on the how many states what the GPU is actually capable of + * running, with some headroom to improve occupancy. 
If the texture don't fit, offload into + * unified memory. */ + size_t states_memory_size = num_states * state_size; + size_t device_memory_amount = + (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue()); + if (states_memory_size >= device_memory_amount / 4) { + num_states = device_memory_amount / 4 / state_size; + } + } + else { + /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU + * memory. */ + num_states = 1024 * 512; + } + + VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int OneapiDeviceQueue::num_concurrent_busy_states() const +{ + const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( + oneapi_device_->sycl_queue()); + if (compute_units >= 128) { + return 1024 * 1024; + } + else { + return 1024 * 512; + } +} + +void OneapiDeviceQueue::init_execution() +{ + oneapi_device_->load_texture_info(); + + SyclQueue *device_queue = oneapi_device_->sycl_queue(); + void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer(); + assert(device_queue); + assert(kg_dptr); + kernel_context_ = new KernelContext{device_queue, kg_dptr}; + + debug_init_execution(); +} + +bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, + const int signed_kernel_work_size, + DeviceKernelArguments const &_args) +{ + if (oneapi_device_->have_error()) { + return false; + } + + void **args = const_cast<void **>(_args.values); + + debug_enqueue(kernel, signed_kernel_work_size); + assert(signed_kernel_work_size >= 0); + size_t kernel_work_size = (size_t)signed_kernel_work_size; + + size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size( + kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size); + size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size); + + assert(kernel_context_); + + /* Call the oneAPI kernel DLL to launch the 
requested kernel. */ + bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel( + kernel_context_, kernel, uniformed_kernel_work_size, args); + + if (is_finished_ok == false) { + oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) + + "\" execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + } + + return is_finished_ok; +} + +bool OneapiDeviceQueue::synchronize() +{ + if (oneapi_device_->have_error()) { + return false; + } + + bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue()); + if (is_finished_ok == false) + oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + + debug_synchronize(); + + return !(oneapi_device_->have_error()); +} + +void OneapiDeviceQueue::zero_to_device(device_memory &mem) +{ + oneapi_device_->mem_zero(mem); +} + +void OneapiDeviceQueue::copy_to_device(device_memory &mem) +{ + oneapi_device_->mem_copy_to(mem); +} + +void OneapiDeviceQueue::copy_from_device(device_memory &mem) +{ + oneapi_device_->mem_copy_from(mem); +} + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h new file mode 100644 index 00000000000..09a015303b6 --- /dev/null +++ b/intern/cycles/device/oneapi/queue.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include "device/kernel.h" +# include "device/memory.h" +# include "device/queue.h" + +# include "device/oneapi/device.h" +# include "device/oneapi/dll_interface.h" + +CCL_NAMESPACE_BEGIN + +class OneapiDevice; +class device_memory; + +/* Base class for Oneapi queues. 
*/ +class OneapiDeviceQueue : public DeviceQueue { + public: + explicit OneapiDeviceQueue(OneapiDevice *device); + ~OneapiDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, + const int kernel_work_size, + DeviceKernelArguments const &args) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + protected: + OneapiDevice *oneapi_device_; + OneAPIDLLInterface oneapi_dll_; + KernelContext *kernel_context_; + bool with_kernel_statistics_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 9ad1c465725..6912bf928cd 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -1103,6 +1103,8 @@ static const char *device_type_for_description(const DeviceType type) return "OptiX"; case DEVICE_HIP: return "HIP"; + case DEVICE_ONEAPI: + return "oneAPI"; case DEVICE_DUMMY: return "Dummy"; case DEVICE_MULTI: diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a07d7852211..ccd694dfdfd 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -37,6 +37,10 @@ set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel_shader_raytrace.cu ) +set(SRC_KERNEL_DEVICE_ONEAPI + device/oneapi/kernel.cpp +) + set(SRC_KERNEL_DEVICE_CPU_HEADERS device/cpu/compat.h device/cpu/image.h @@ -78,6 +82,17 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS device/metal/globals.h ) +set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS + device/oneapi/compat.h + device/oneapi/context_begin.h + device/oneapi/context_end.h + device/oneapi/device_id.h + 
device/oneapi/globals.h + device/oneapi/image.h + device/oneapi/kernel.h + device/oneapi/kernel_templates.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -687,6 +702,212 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_optix) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + if(WIN32) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll) + else() + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so) + endif() + + set(cycles_oneapi_kernel_sources + ${SRC_KERNEL_DEVICE_ONEAPI} + ${SRC_KERNEL_HEADERS} + ${SRC_KERNEL_DEVICE_GPU_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} + ${SRC_UTIL_HEADERS} + ) + + # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options + set(sycl_compiler_flags + ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI} + -fsycl + -fsycl-unnamed-lambda + -fdelayed-template-parsing + -mllvm -inlinedefault-threshold=300 + -mllvm -inlinehint-threshold=400 + -shared + -DWITH_ONEAPI + -ffast-math + -DNDEBUG + -O2 + -o ${cycles_kernel_oneapi_lib} + -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
+ -I${LEVEL_ZERO_INCLUDE_DIR} + ${LEVEL_ZERO_LIBRARY} + ${SYCL_CPP_FLAGS} + ) + + + if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED) + list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED) + endif() + + # Set defaults for spir64 and spir64_gen options + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) + set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") + endif() + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen) + SET (CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target") + endif() + # enabling zebin (graphics binary format with improved compatibility) on Windows only while support on Linux isn't available yet + if(WIN32) + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ") + endif() + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ") + + if (WITH_CYCLES_ONEAPI_BINARIES) + # Iterate over all targest and their options + list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string) + list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string}) + foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS}) + if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target}) + list (APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}") + endif() + endforeach() + else() + # If AOT is disabled, build for spir64 + list(APPEND sycl_compiler_flags + -fsycl-targets=spir64 + -Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}") + endif() + + if(WITH_NANOVDB) + list(APPEND sycl_compiler_flags + -DWITH_NANOVDB + -I"${NANOVDB_INCLUDE_DIR}") + endif() + + if(WITH_CYCLES_DEBUG) + list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG) + endif() + + get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY) + get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE) + + if(NOT 
OCLOC_INSTALL_DIR) + get_filename_component(OCLOC_INSTALL_DIR "${sycl_compiler_root}/../lib/ocloc" ABSOLUTE) + endif() + if(WITH_CYCLES_ONEAPI_BINARIES AND NOT EXISTS ${OCLOC_INSTALL_DIR}) + message(FATAL_ERROR "WITH_CYCLES_ONEAPI_BINARIES requires ocloc but ${OCLOC_INSTALL_DIR} directory doesn't exist." + " A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.") + endif() + + if(UNIX AND NOT APPLE) + if(NOT WITH_CXX11_ABI) + check_library_exists(sycl + _ZN2cl4sycl7handler22verifyUsedKernelBundleERKSs ${sycl_compiler_root}/../lib SYCL_NO_CXX11_ABI) + if(SYCL_NO_CXX11_ABI) + list(APPEND sycl_compiler_flags -D_GLIBCXX_USE_CXX11_ABI=0) + endif() + endif() + endif() + + if(WIN32) + list(APPEND sycl_compiler_flags + -fms-extensions + -fms-compatibility + -D_WINDLL + -D_MBCS + -DWIN32 + -D_WINDOWS + -D_CRT_NONSTDC_NO_DEPRECATE + -D_CRT_SECURE_NO_DEPRECATE + -DONEAPI_EXPORT) + + if(sycl_compiler_compiler_name MATCHES "dpcpp") + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND "${sycl_compiler_root}/../../env/vars.bat" + COMMAND ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR}) + if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows + get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY) + string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir}) + get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE) + else() + set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}) + endif() + list(APPEND sycl_compiler_flags + -L "${MSVC_TOOLS_DIR}/lib/x64" + -L "${WINDOWS_KIT_DIR}/um/x64" + -L "${WINDOWS_KIT_DIR}/ucrt/x64") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib + "PATH=${OCLOC_INSTALL_DIR};${sycl_compiler_root}" + ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + else() + list(APPEND sycl_compiler_flags -fPIC) + + # avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed. + if(WITH_CYCLES_ONEAPI_BINARIES) + list(APPEND sycl_compiler_flags -fhonor-nans) + endif() + + # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and + # libpi_level_zero.so can be placed next to it and get found. + list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN') + + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + if(sycl_compiler_compiler_name MATCHES "dpcpp") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND bash -c \"source ${sycl_compiler_root}/../../env/vars.sh&&${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}\" + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ if(NOT IGC_INSTALL_DIR) + get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE) + endif() + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib" + "PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}" # env PATH is for compiler to find ld + ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + endif() + + # install dynamic libraries required at runtime + if(WIN32) + set(SYCL_RUNTIME_DEPENDENCIES + sycl.dll + pi_level_zero.dll + ) + if(NOT WITH_BLENDER) + # For the Cycles standalone put libraries next to the Cycles application. + delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}) + else() + # For Blender put the libraries next to the Blender executable. + # + # Note that the installation path in the delayed_install is relative to the versioned folder, + # which means we need to go one level up. 
+ delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../") + endif() + elseif(UNIX AND NOT APPLE) + file(GLOB SYCL_RUNTIME_DEPENDENCIES + ${sycl_compiler_root}/../lib/libsycl.so + ${sycl_compiler_root}/../lib/libsycl.so.[0-9] + ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9] + ) + list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so) + delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib) + endif() + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib) + add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib}) +endif() + # OSL module if(WITH_CYCLES_OSL) @@ -752,6 +973,7 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} ${SRC_KERNEL_DEVICE_METAL_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -764,6 +986,7 @@ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) +source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}) source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS}) source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) @@ -782,6 +1005,9 @@ endif() if(WITH_CYCLES_HIP) add_dependencies(cycles_kernel cycles_kernel_hip) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + add_dependencies(cycles_kernel cycles_kernel_oneapi) +endif() # Install kernel source for runtime compilation diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 
d657571a5fa..b9a44ccad02 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -14,6 +14,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_begin.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_begin.h" #endif #include "kernel/device/gpu/work_stealing.h" @@ -40,6 +42,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_end.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_end.h" #endif #include "kernel/film/read.h" diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 7d7266d5edf..c1df49c4f49 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif -#ifndef __KERNEL_METAL__ +/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads, + * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices + * and keep device specific code in compat.h */ + +#ifdef __KERNEL_ONEAPI__ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +template<typename IsActiveOp> +void cpu_serial_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + int write_index = 0; + for (int state_index = 0; state_index < num_states; state_index++) { + if (is_active_op(state_index)) + indices[write_index++] = state_index; + } + *num_indices = write_index; + return; +} +# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */ + +template<typename IsActiveOp> +void gpu_parallel_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + const sycl::nd_item<1> &item_id = 
sycl::ext::oneapi::experimental::this_nd_item<1>(); + const uint blocksize = item_id.get_local_range(0); + + sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1], + sycl::access::address_space::local_space> + ptr = sycl::ext::oneapi::group_local_memory< + int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group()); + int *warp_offset = *ptr; + + /* NOTE(@nsirgien): Here we calculate the same value as below but + * faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into + * something faster already but DPC++ doesn't, so it's better to use + * direct request of needed parameters - switching from this computation to computation below + * will cause 2.5x performance slowdown. */ + const uint thread_index = item_id.get_local_id(0); + const uint thread_warp = item_id.get_sub_group().get_local_id(); + + const uint warp_index = item_id.get_sub_group().get_group_id(); + const uint num_warps = item_id.get_sub_group().get_group_range()[0]; + + const uint state_index = item_id.get_global_id(0); + + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#else /* !__KERNEL__ONEAPI__ */ +# ifndef __KERNEL_METAL__ template<uint blocksize, typename IsActiveOp> __device__ -#endif +# endif void gpu_parallel_active_index_array_impl(const uint num_states, ccl_global int *indices, ccl_global int *num_indices, -#ifdef __KERNEL_METAL__ +# ifdef __KERNEL_METAL__ const uint is_active, const uint blocksize, const int thread_index, @@ -37,7 +90,7 @@ __device__ const int num_warps, threadgroup int *warp_offset) { -#else +# else IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -52,18 +105,33 @@ __device__ /* Test if state corresponding to this thread is active. */ const uint is_active = (state_index < num_states) ? 
is_active_op(state_index) : 0; -#endif - +# endif +#endif /* !__KERNEL_ONEAPI__ */ /* For each thread within a warp compute how many other active states precede it. */ +#ifdef __KERNEL_ONEAPI__ + const uint thread_offset = sycl::exclusive_scan_over_group( + item_id.get_sub_group(), is_active, std::plus<>()); +#else const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & ccl_gpu_thread_mask(thread_warp)); +#endif /* Last thread in warp stores number of active states for each warp. */ +#ifdef __KERNEL_ONEAPI__ + if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) { +#else if (thread_warp == ccl_gpu_warp_size - 1) { +#endif warp_offset[warp_index] = thread_offset + is_active; } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important, + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Last thread in block converts per-warp sizes to offsets, increments global size of * index array and gets offset to write to. */ @@ -80,7 +148,13 @@ __device__ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only important local memory writing (warp_offset), + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Write to index array. 
*/ if (is_active) { @@ -107,7 +181,19 @@ __device__ simd_group_index, \ num_simd_groups, \ simdgroup_offset) - +#elif defined(__KERNEL_ONEAPI__) +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + if (ccl_gpu_global_size_x() == 1) \ + cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \ + else \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op); +# else +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) +# endif #else # define gpu_parallel_active_index_array( \ diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h new file mode 100644 index 00000000000..30b0f088ede --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_ONEAPI__ + +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#include <cstdint> + +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + +/* This one does not have an abstraction. + * It's used by other devices directly. 
+ */ + +#define __device__ + +/* Qualifier wrappers for different names on different devices */ + +#define ccl_device +#define ccl_global +#define ccl_always_inline __attribute__((always_inline)) +#define ccl_device_inline inline +#define ccl_noinline +#define ccl_inline_constant const constexpr +#define ccl_static_constant const +#define ccl_device_forceinline __attribute__((always_inline)) +#define ccl_device_noinline ccl_device ccl_noinline +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_optional_struct_init +#define ccl_private +#define ATTR_FALLTHROUGH __attribute__((fallthrough)) +#define ccl_constant const +#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +#define ccl_align(n) __attribute__((aligned(n))) +#define kernel_assert(cond) +#define ccl_may_alias + +/* clang-format off */ + +/* kernel.h adapters */ +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define KG_ND_ITEMS \ + kg->nd_item_local_id_0 = item.get_local_id(0); \ + kg->nd_item_local_range_0 = item.get_local_range(0); \ + kg->nd_item_group_0 = item.get_group(0); \ + kg->nd_item_group_range_0 = item.get_group_range(0); \ + kg->nd_item_global_id_0 = item.get_global_id(0); \ + kg->nd_item_global_range_0 = item.get_global_range(0); +#else +# define KG_ND_ITEMS +#endif + +#define ccl_gpu_kernel_signature(name, ...) 
\ +void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) { \ + (kg); \ + cgh.parallel_for<class kernel_##name>( \ + sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ + [=](sycl::nd_item<1> item) { \ + KG_ND_ITEMS + +#define ccl_gpu_kernel_postfix \ + }); \ + } + +#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x + +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \ + ccl_private const ONEAPIKernelContext *kg; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) + +/* GPU thread, block, grid size and index */ +#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED +# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) +#else +# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) +# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) +# define ccl_gpu_block_idx_x (kg->nd_item_group_0) +# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# 
define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) +# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) +#endif + + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() +#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) +#ifdef __SYCL_DEVICE_ONLY__ + #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +#else + #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +#endif + +/* Debug defines */ +#if defined(__SYCL_DEVICE_ONLY__) +# define CONSTANT __attribute__((opencl_constant)) +#else +# define CONSTANT +#endif + +#define sycl_printf(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +/* GPU texture objects */ + +/* clang-format on */ + +/* Types */ +/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc + * because these types have different interfaces from blender version */ + +using uchar = unsigned char; +using sycl::half; + +struct float3 { + float x, y, z; +}; + +ccl_always_inline float3 make_float3(float x, float y, float z) +{ + return {x, y, z}; +} +ccl_always_inline float3 make_float3(float x) +{ + return {x, x, x}; +} + +/* math functions */ +#define fabsf(x) sycl::fabs((x)) +#define copysignf(x, y) sycl::copysign((x), (y)) +#define asinf(x) sycl::asin((x)) +#define acosf(x) sycl::acos((x)) +#define atanf(x) sycl::atan((x)) +#define floorf(x) sycl::floor((x)) +#define ceilf(x) sycl::ceil((x)) +#define sinhf(x) sycl::sinh((x)) +#define coshf(x) sycl::cosh((x)) +#define 
tanhf(x) sycl::tanh((x)) +#define hypotf(x, y) sycl::hypot((x), (y)) +#define atan2f(x, y) sycl::atan2((x), (y)) +#define fmaxf(x, y) sycl::fmax((x), (y)) +#define fminf(x, y) sycl::fmin((x), (y)) +#define fmodf(x, y) sycl::fmod((x), (y)) +#define lgammaf(x) sycl::lgamma((x)) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" + +/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they + * include oneAPI headers, which transitively include math.h headers which will cause redefintions + * of the math defines because math.h also uses them and having them defined before math.h include + * is actually UB. */ +/* Use fast math functions - get them from sycl::native namespace for native math function + * implementations */ +#define cosf(x) sycl::native::cos(((float)(x))) +#define sinf(x) sycl::native::sin(((float)(x))) +#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) +#define tanf(x) sycl::native::tan(((float)(x))) +#define logf(x) sycl::native::log(((float)(x))) +#define expf(x) sycl::native::exp(((float)(x))) diff --git a/intern/cycles/kernel/device/oneapi/context_begin.h b/intern/cycles/kernel/device/oneapi/context_begin.h new file mode 100644 index 00000000000..6d6f8cec4ca --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_begin.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_NANOVDB +# include <nanovdb/NanoVDB.h> +# include <nanovdb/util/SampleFromVoxels.h> +#endif + +/* clang-format off */ +struct ONEAPIKernelContext : public KernelGlobalsGPU { + public: +# include "kernel/device/oneapi/image.h" + /* clang-format on */ diff --git a/intern/cycles/kernel/device/oneapi/context_end.h b/intern/cycles/kernel/device/oneapi/context_end.h new file mode 100644 index 00000000000..ddf0d1f1712 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_end.h @@ -0,0 
+1,7 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ +} +; /* end of ONEAPIKernelContext class definition */ + +#undef kernel_integrator_state +#define kernel_integrator_state (*(kg->integrator_state)) diff --git a/intern/cycles/kernel/device/oneapi/device_id.h b/intern/cycles/kernel/device/oneapi/device_id.h new file mode 100644 index 00000000000..b4c94ac27a2 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/device_id.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +/* from public source : + * https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/include/pci_ids/iris_pci_ids.h */ +const static std::set<uint32_t> intel_arc_alchemist_device_ids = { + 0x4f80, 0x4f81, 0x4f82, 0x4f83, 0x4f84, 0x4f87, 0x4f88, 0x5690, 0x5691, + 0x5692, 0x5693, 0x5694, 0x5695, 0x5696, 0x5697, 0x56a0, 0x56a1, 0x56a2, + 0x56a3, 0x56a4, 0x56a5, 0x56a6, 0x56b0, 0x56b1, 0x56b2, 0x56b3}; diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h new file mode 100644 index 00000000000..2d740b4c64a --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h @@ -0,0 +1,50 @@ +/* device_capabilities() returns a C string that must be free'd with oneapi_free(). 
*/ +DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) +DLL_INTERFACE_CALL(oneapi_free, void, void *) +DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) + +DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) +DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) + +DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) +DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) +DLL_INTERFACE_CALL( + oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) +DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) +DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) + +DLL_INTERFACE_CALL( + oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) +DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_usm_memset, + bool, + SyclQueue *queue, + void *usm_ptr, + unsigned char value, + size_t num_bytes) + +DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) + +/* Operation with Kernel globals structure - map of global/constant allocation - filled before + * render/kernel execution As we don't know in cycles sizeof this - Cycles will manage just as + * pointer. 
*/ +DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) +DLL_INTERFACE_CALL(oneapi_set_global_memory, + void, + SyclQueue *queue, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) + +DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, + size_t, + SyclQueue *queue, + const DeviceKernel kernel, + const size_t kernel_global_size) +DLL_INTERFACE_CALL(oneapi_enqueue_kernel, + bool, + KernelContext *context, + int kernel, + size_t global_size, + void **args) diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h new file mode 100644 index 00000000000..d60f4f135ba --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#include "kernel/integrator/state.h" +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +CCL_NAMESPACE_BEGIN + +/* NOTE(@nsirgien): With SYCL we can't declare __constant__ global variable, which will be + * accessible from device code, like it has been done for Cycles CUDA backend. 
So, the backend will + * allocate this "constant" memory regions and store pointers to them in oneAPI context class */ + +struct IntegratorStateGPU; +struct IntegratorQueueCounter; + +typedef struct KernelGlobalsGPU { + +#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr; +#include "kernel/data_arrays.h" +#undef KERNEL_DATA_ARRAY + IntegratorStateGPU *integrator_state; + const KernelData *__data; +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + size_t nd_item_local_id_0; + size_t nd_item_local_range_0; + size_t nd_item_group_0; + size_t nd_item_group_range_0; + + size_t nd_item_global_id_0; + size_t nd_item_global_range_0; +#endif +} KernelGlobalsGPU; + +typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data (*(__data)) +#define kernel_integrator_state (*(integrator_state)) + +/* data lookup defines */ + +#define kernel_data_fetch(name, index) __##name[index] +#define kernel_data_array(name) __##name + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h new file mode 100644 index 00000000000..892558d40bf --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/image.h @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +CCL_NAMESPACE_BEGIN + +/* For oneAPI implementation we do manual lookup and interpolation. */ +/* TODO: share implementation with ../cpu/image.h. 
*/ + +template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index) +{ + return reinterpret_cast<ccl_global T *>(info.data)[index]; +} + +ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +{ + x %= width; + if (x < 0) + x += width; + return x; +} + +ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +{ + return clamp(x, 0, width - 1); +} + +ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z) +{ + const int data_offset = x + info.width * y + info.width * info.height * z; + const int texture_type = info.data_type; + + /* Float4 */ + if (texture_type == IMAGE_DATA_TYPE_FLOAT4) { + return tex_fetch<float4>(info, data_offset); + } + /* Byte4 */ + else if (texture_type == IMAGE_DATA_TYPE_BYTE4) { + uchar4 r = tex_fetch<uchar4>(info, data_offset); + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + /* Ushort4 */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT4) { + ushort4 r = tex_fetch<ushort4>(info, data_offset); + float f = 1.0f / 65535.f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + /* Float */ + else if (texture_type == IMAGE_DATA_TYPE_FLOAT) { + float f = tex_fetch<float>(info, data_offset); + return make_float4(f, f, f, 1.0f); + } + /* UShort */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT) { + ushort r = tex_fetch<ushort>(info, data_offset); + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + else if (texture_type == IMAGE_DATA_TYPE_HALF) { + float f = tex_fetch<half>(info, data_offset); + return make_float4(f, f, f, 1.0f); + } + else if (texture_type == IMAGE_DATA_TYPE_HALF4) { + half4 r = tex_fetch<half4>(info, data_offset); + return make_float4(r.x, r.y, r.z, r.w); + } + /* Byte */ + else { + uchar r = tex_fetch<uchar>(info, data_offset); + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device_inline float4 
svm_image_texture_read_2d(int id, int x, int y) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + } + + return svm_image_texture_read(info, x, y, 0); +} + +ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + z = svm_image_texture_wrap_periodic(z, info.depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + z = svm_image_texture_wrap_clamp(z, info.depth); + } + + return svm_image_texture_read(info, x, y, z); +} + +static float svm_image_texture_frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); + *ix = i; + return x - (float)i; +} + +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 + +ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + if (info.interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy; + svm_image_texture_frac(x * info.width, &ix); + svm_image_texture_frac(y * info.height, &iy); + + return svm_image_texture_read_2d(id, ix, iy); + } + else if (info.interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float4 r; + r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy); + r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy); + r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1); + r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float u[4], v[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y]; + r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1); + } + } + return r; + } +} + +#ifdef WITH_NANOVDB +template<typename T> struct NanoVDBInterpolator { + + typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; + + static ccl_always_inline float4 read(float r) + { + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(nanovdb::Vec3f r) + { + return make_float4(r[0], r[1], r[2], 1.0f); + } + + static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x, y, z); + return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz)); + } + + static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f); + return 
read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz)); + } + + static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z) + { + int ix, iy, iz; + int nix, niy, niz; + int pix, piy, piz; + int nnix, nniy, nniz; + /* Tricubic b-spline interpolation. */ + const float tx = svm_image_texture_frac(x - 0.5f, &ix); + const float ty = svm_image_texture_frac(y - 0.5f, &iy); + const float tz = svm_image_texture_frac(z - 0.5f, &iz); + pix = ix - 1; + piy = iy - 1; + piz = iz - 1; + nix = ix + 1; + niy = iy + 1; + niz = iz + 1; + nnix = ix + 2; + nniy = iy + 2; + nniz = iz + 2; + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + const int zc[4] = {piz, iz, niz, nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z])))) +# define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +# define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +# undef COL_TERM +# undef ROW_TERM +# undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, int interp) + { + using namespace nanovdb; + + NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; + AccessorType acc = grid->getAccessor(); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(acc, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(acc, x, y, z); + default: + return interp_3d_cubic(acc, x, y, z); + } + } +}; +#endif /* WITH_NANOVDB */ + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.use_transform_3d) { + Transform tfm = info.transform_3d; + P = transform_point(&tfm, P); + } + + float x = P.x; + float y = P.y; + float z = P.z; + + uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; + +#ifdef WITH_NANOVDB + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { + return NanoVDBInterpolator<float>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) { + return NanoVDBInterpolator<nanovdb::FpN>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return NanoVDBInterpolator<nanovdb::Fp16>::interp_3d(info, x, y, z, interpolation); + } +#else + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +#endif + else { + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + x *= info.width; + y *= info.height; + z *= info.depth; + } + + if (interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy, iz; + svm_image_texture_frac(x, &ix); + svm_image_texture_frac(y, &iy); + svm_image_texture_frac(z, &iz); + + return svm_image_texture_read_3d(id, ix, iy, iz); + } + else if (interpolation == INTERPOLATION_LINEAR) { + /* Trilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float4 r; + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz); + r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz); + r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz); + r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz); + + r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1); + r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1); + r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1); + r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1); + return r; + } + else { + /* Tricubic interpolation. 
*/ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float u[4], v[4], w[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int z = 0; z < 4; z++) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y] * w[z]; + r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1); + } + } + } + return r; + } +} + +#undef SET_CUBIC_SPLINE_WEIGHTS + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp new file mode 100644 index 00000000000..62affe6e58e --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -0,0 +1,884 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +/* clang-format off */ +# include "kernel.h" +# include <iostream> +# include <map> +# include <set> + +# include <level_zero/ze_api.h> +# include <CL/sycl.hpp> +# include <ext/oneapi/backend/level_zero.hpp> + +# include "kernel/device/oneapi/compat.h" +# include "kernel/device/oneapi/device_id.h" +# include "kernel/device/oneapi/globals.h" +# include "kernel/device/oneapi/kernel_templates.h" + +# include "kernel/device/gpu/kernel.h" +/* clang-format on */ + +static OneAPIErrorCallback s_error_cb = nullptr; +static void *s_error_user_ptr = nullptr; + +static std::vector<sycl::device> oneapi_available_devices(); + +void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) +{ + s_error_cb = cb; + s_error_user_ptr = user_ptr; +} + +void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) +{ +# ifdef _DEBUG + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + sycl::info::device_type device_type = + 
queue->get_device().get_info<sycl::info::device::device_type>(); + sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); + (void)usm_type; + assert(usm_type == sycl::usm::alloc::device || + ((device_type == sycl::info::device_type::host || + device_type == sycl::info::device_type::is_cpu || allow_host) && + usm_type == sycl::usm::alloc::host)); +# endif +} + +bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) +{ + bool finished_correct = true; + try { + std::vector<sycl::device> devices = oneapi_available_devices(); + if (device_index < 0 || device_index >= devices.size()) { + return false; + } + sycl::queue *created_queue = new sycl::queue(devices[device_index], + sycl::property::queue::in_order()); + external_queue = reinterpret_cast<SyclQueue *>(created_queue); + } + catch (sycl::exception const &e) { + finished_correct = false; + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + } + return finished_correct; +} + +void oneapi_free_queue(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + delete queue; +} + +void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::aligned_alloc_host(alignment, memory_size, *queue); +} + +void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::malloc_device(memory_size, *queue); +} + +void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + oneapi_check_usm(queue_, usm_ptr, true); + sycl::free(usm_ptr, *queue); +} + +bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + 
oneapi_check_usm(queue_, dest, true); + oneapi_check_usm(queue_, src, true); + try { + sycl::event mem_event = queue->memcpy(dest, src, num_bytes); + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + oneapi_check_usm(queue_, usm_ptr, true); + try { + sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_queue_synchronize(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + try { + queue->wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and + * also trigger runtime compilation of all existing oneAPI kernels */ +bool oneapi_run_test_kernel(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + size_t N = 8; + sycl::buffer<float, 1> A(N); + sycl::buffer<float, 1> B(N); + + { + sycl::host_accessor A_host_acc(A, sycl::write_only); + for (size_t i = (size_t)0; i < N; i++) + A_host_acc[i] = rand() % 32; + } + + try { + queue->submit([&](sycl::handler &cgh) { + sycl::accessor A_acc(A, cgh, sycl::read_only); + sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); + + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + }); + queue->wait_and_throw(); + + sycl::host_accessor A_host_acc(A, sycl::read_only); + sycl::host_accessor 
B_host_acc(B, sycl::read_only); + + for (size_t i = (size_t)0; i < N; i++) { + float result = A_host_acc[i] + B_host_acc[i]; + (void)result; + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } + + return true; +} + +bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) +{ + kernel_global_size = sizeof(KernelGlobalsGPU); + + return true; +} + +void oneapi_set_global_memory(SyclQueue *queue_, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) +{ + assert(queue_); + assert(kernel_globals); + assert(memory_name); + assert(memory_device_pointer); + KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; + oneapi_check_usm(queue_, memory_device_pointer); + oneapi_check_usm(queue_, kernel_globals, true); + + std::string matched_name(memory_name); + +/* This macro will change global ptr of KernelGlobals via name matching. */ +# define KERNEL_DATA_ARRAY(type, name) \ + else if (#name == matched_name) \ + { \ + globals->__##name = (type *)memory_device_pointer; \ + return; \ + } + if (false) { + } + else if ("integrator_state" == matched_name) { + globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; + return; + } + KERNEL_DATA_ARRAY(KernelData, data) +# include "kernel/data_arrays.h" + else + { + std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" + << std::endl; + assert(false); + } +# undef KERNEL_DATA_ARRAY +} + +/* TODO: Move device information to OneapiDevice initialized on creation and use it. */ +/* TODO: Move below function to oneapi/queue.cpp. 
*/ +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, + const DeviceKernel kernel, + const size_t kernel_global_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + (void)kernel_global_size; + const static size_t preferred_work_group_size_intersect_shading = 32; + const static size_t preferred_work_group_size_technical = 1024; + + size_t preferred_work_group_size = 0; + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + preferred_work_group_size = preferred_work_group_size_intersect_shading; + break; + + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: + case DEVICE_KERNEL_INTEGRATOR_RESET: + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + preferred_work_group_size = preferred_work_group_size_technical; + break; + + default: + preferred_work_group_size = 512; + } + + const size_t 
limit_work_group_size = + queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); +} + +bool oneapi_enqueue_kernel(KernelContext *kernel_context, + int kernel, + size_t global_size, + void **args) +{ + bool success = true; + ::DeviceKernel device_kernel = (::DeviceKernel)kernel; + KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals; + sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue); + assert(queue); + if (!queue) { + return false; + } + + size_t local_size = oneapi_kernel_preferred_local_size( + kernel_context->queue, device_kernel, global_size); + assert(global_size % local_size == 0); + + /* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we + * overwrite it outside of oneapi_kernel_preferred_local_size. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) { + local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE; + } + + /* Kernels listed below need a specific number of work groups. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) { + int num_states = *((int *)(args[0])); + /* Round up to the next work-group. */ + size_t groups_count = (num_states + local_size - 1) / local_size; + /* NOTE(@nsirgien): As for now non-uniform workgroups don't work on most oneAPI devices, we + * extend work size to fit uniformity requirements. 
*/ + global_size = groups_count * local_size; + +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + if (queue->get_device().is_host()) { + global_size = 1; + local_size = 1; + } +# endif + } + + /* Let the compiler throw an error if there are any kernels missing in this implementation. */ +# if defined(_WIN32) +# pragma warning(error : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic error "-Wswitch" +# endif + + try { + queue->submit([&](sycl::handler &cgh) { + switch (device_kernel) { + case DEVICE_KERNEL_INTEGRATOR_RESET: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_subsurface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_volume_stack); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light); + 
break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_shade_surface_raytrace); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_queued_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: { + oneapi_call(kg, 
+ cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_paths_array); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_adaptive_sampling_convergence_check); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_shader_eval_curve_shadow_transparency); + break; + } + case DEVICE_KERNEL_PREFIX_SUM: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum); + break; + } + + /* clang-format off */ + # define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \ + oneapi_call(kg, cgh, \ + global_size, \ + local_size, \ + args, \ + oneapi_kernel_film_convert_##variant); \ + break; \ + } + +# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba) + + DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH); + 
DEVICE_KERNEL_FILM_CONVERT(mist, MIST); + DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT); + DEVICE_KERNEL_FILM_CONVERT(float, FLOAT); + DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH); + DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3); + DEVICE_KERNEL_FILM_CONVERT(motion, MOTION); + DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow, + SHADOW_CATCHER_MATTE_WITH_SHADOW); + DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED); + DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4); + +# undef DEVICE_KERNEL_FILM_CONVERT +# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL + /* clang-format on */ + + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_filter_guiding_set_fake_albedo); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess); + break; + } + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: { + oneapi_call(kg, + cgh, + global_size, + local_size, + 
args, + oneapi_kernel_integrator_shadow_catcher_count_possible_splits); + break; + } + /* Unsupported kernels */ + case DEVICE_KERNEL_NUM: + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + assert(0); + return false; + } + + /* Unknown kernel. */ + assert(0); + return false; + }); + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + success = false; + } + } + +# if defined(_WIN32) +# pragma warning(default : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic pop +# endif + return success; +} + +static const int lowest_supported_driver_version_win = 1011660; +static const int lowest_supported_driver_version_neo = 20066; + +static int parse_driver_build_version(const sycl::device &device) +{ + const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); + int driver_build_version = 0; + + size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); + if (second_dot_position == std::string::npos) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." 
<< std::endl; + } + else { + try { + size_t third_dot_position = driver_version.find('.', second_dot_position + 1); + if (third_dot_position != std::string::npos) { + const std::string &third_number_substr = driver_version.substr( + second_dot_position + 1, third_dot_position - second_dot_position - 1); + const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); + if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) + driver_build_version = std::stoi(third_number_substr) * 10000 + + std::stoi(forth_number_substr); + } + else { + const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); + driver_build_version = std::stoi(third_number_substr); + } + } + catch (std::invalid_argument &e) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." << std::endl; + } + } + + return driver_build_version; +} + +static std::vector<sycl::device> oneapi_available_devices() +{ + bool allow_all_devices = false; + if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) + allow_all_devices = true; + + /* Host device is useful only for debugging at the moment + * so we hide this device with default build settings. */ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + bool allow_host = true; +# else + bool allow_host = false; +# endif + + const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); + + std::vector<sycl::device> available_devices; + for (const sycl::platform &platform : oneapi_platforms) { + /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. + */ + if (platform.get_backend() == sycl::backend::opencl) { + continue; + } + + const std::vector<sycl::device> &oneapi_devices = + (allow_all_devices || allow_host) ? 
platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); + + for (const sycl::device &device : oneapi_devices) { + if (allow_all_devices) { + /* still filter out host device if build doesn't support it. */ + if (allow_host || !device.is_host()) { + available_devices.push_back(device); + } + } + else { + bool filter_out = false; + + /* For now we support all Intel(R) Arc(TM) devices + * and any future GPU with more than 128 execution units + * official support can be broaden to older and smaller GPUs once ready. */ + if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { + ze_device_handle_t ze_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( + device); + ze_device_properties_t props = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + zeDeviceGetProperties(ze_device, &props); + bool is_dg2 = (intel_arc_alchemist_device_ids.find(props.deviceId) != + intel_arc_alchemist_device_ids.end()); + int number_of_eus = props.numEUsPerSubslice * props.numSubslicesPerSlice * + props.numSlices; + if (!is_dg2 || number_of_eus < 128) + filter_out = true; + + /* if not already filtered out, check driver version. 
*/ + if (!filter_out) { + int driver_build_version = parse_driver_build_version(device); + if ((driver_build_version > 100000 && + driver_build_version < lowest_supported_driver_version_win) || + (driver_build_version > 0 && + driver_build_version < lowest_supported_driver_version_neo)) { + filter_out = true; + } + } + } + else if (!allow_host && device.is_host()) { + filter_out = true; + } + else if (!allow_all_devices) { + filter_out = true; + } + + if (!filter_out) { + available_devices.push_back(device); + } + } + } + } + + return available_devices; +} + +char *oneapi_device_capabilities() +{ + std::stringstream capabilities; + + const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); + for (const sycl::device &device : oneapi_devices) { + const std::string &name = device.get_info<sycl::info::device::name>(); + + capabilities << std::string("\t") << name << "\n"; +# define WRITE_ATTR(attribute_name, attribute_variable) \ + capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ + << "\n"; +# define GET_NUM_ATTR(attribute) \ + { \ + size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ + capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ + } + + GET_NUM_ATTR(vendor_id) + GET_NUM_ATTR(max_compute_units) + GET_NUM_ATTR(max_work_item_dimensions) + + sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); + WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) + WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) + WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) + + GET_NUM_ATTR(max_work_group_size) + GET_NUM_ATTR(max_num_sub_groups) + GET_NUM_ATTR(sub_group_independent_forward_progress) + + GET_NUM_ATTR(preferred_vector_width_char) + GET_NUM_ATTR(preferred_vector_width_short) + GET_NUM_ATTR(preferred_vector_width_int) + 
GET_NUM_ATTR(preferred_vector_width_long) + GET_NUM_ATTR(preferred_vector_width_float) + GET_NUM_ATTR(preferred_vector_width_double) + GET_NUM_ATTR(preferred_vector_width_half) + + GET_NUM_ATTR(native_vector_width_char) + GET_NUM_ATTR(native_vector_width_short) + GET_NUM_ATTR(native_vector_width_int) + GET_NUM_ATTR(native_vector_width_long) + GET_NUM_ATTR(native_vector_width_float) + GET_NUM_ATTR(native_vector_width_double) + GET_NUM_ATTR(native_vector_width_half) + + size_t max_clock_frequency = + (size_t)(device.is_host() ? (size_t)0 : + device.get_info<sycl::info::device::max_clock_frequency>()); + WRITE_ATTR("max_clock_frequency", max_clock_frequency) + + GET_NUM_ATTR(address_bits) + GET_NUM_ATTR(max_mem_alloc_size) + + /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't + * supported so we always return false, even if device supports HW texture usage acceleration. + */ + bool image_support = false; + WRITE_ATTR("image_support", (size_t)image_support) + + GET_NUM_ATTR(max_parameter_size) + GET_NUM_ATTR(mem_base_addr_align) + GET_NUM_ATTR(global_mem_size) + GET_NUM_ATTR(local_mem_size) + GET_NUM_ATTR(error_correction_support) + GET_NUM_ATTR(profiling_timer_resolution) + GET_NUM_ATTR(is_available) + +# undef GET_NUM_ATTR +# undef WRITE_ATTR + capabilities << "\n"; + } + + return ::strdup(capabilities.str().c_str()); +} + +void oneapi_free(void *p) +{ + if (p) { + ::free(p); + } +} + +void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) +{ + int num = 0; + std::vector<sycl::device> devices = oneapi_available_devices(); + for (sycl::device &device : devices) { + const std::string &platform_name = + device.get_platform().get_info<sycl::info::platform::name>(); + std::string name = device.get_info<sycl::info::device::name>(); + std::string id = "ONEAPI_" + platform_name + "_" + name; + (cb)(id.c_str(), name.c_str(), num, user_ptr); + num++; + } +} + +size_t oneapi_get_memcapacity(SyclQueue *queue) +{ + return 
reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::global_mem_size>(); +} + +size_t oneapi_get_compute_units_amount(SyclQueue *queue) +{ + return reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_compute_units>(); +} + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h new file mode 100644 index 00000000000..c5f853742ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include <stddef.h> + +/* NOTE(@nsirgien): Should match underlying type in the declaration inside "kernel/types.h" + * TODO: use kernel/types.h directly. */ +enum DeviceKernel : int; + +# ifndef CYCLES_KERNEL_ONEAPI_EXPORT +# ifdef _WIN32 +# if defined(ONEAPI_EXPORT) +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport) +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport) +# endif +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT +# endif +# endif + +class SyclQueue; + +typedef void (*OneAPIDeviceIteratorCallback)(const char *id, + const char *name, + int num, + void *user_ptr); + +typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); + +struct KernelContext { + /* Queue, associated with selected device */ + SyclQueue *queue; + /* Pointer to USM device memory with all global/constant allocation on this device */ + void *kernel_globals; +}; + +/* Use extern C linking so that the symbols can be easily load from the dynamic library at runtime. + */ +# ifdef __cplusplus +extern "C" { +# endif + +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + +# ifdef __cplusplus +} +# endif + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h new file mode 100644 index 00000000000..2dfc96292ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -0,0 +1,121 @@ +#pragma once + +/* Some macro magic to generate templates for kernel arguments. + The resulting oneapi_call() template allows to call a SYCL/C++ kernel + with typed arguments by only giving it a void **args as given by Cycles. + The template will automatically cast from void* to the expectd type. + */ + +/* When expanded by the preprocessor, the generated templates will look like this example: */ +#if 0 +template<typename T0, typename T1, typename T2> +void oneapi_call( + KernelGlobalsGPU *kg, + sycl::handler &cgh, + size_t global_size, + size_t local_size, + void **args, + void (*func)(const KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2)) +{ + func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2])); +} +#endif + +/* clang-format off */ +#define ONEAPI_TYP(x) typename T##x +#define ONEAPI_CAST(x) *(T##x *)(args[x]) +#define ONEAPI_T(x) T##x + +#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N, ...) N +#define ONEAPI_0(_call, ...) +#define ONEAPI_1(_call, x) _call(x) +#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__) +#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__) +#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__) +#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__) +#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__) +#define ONEAPI_7(_call, x, ...) 
_call(x), ONEAPI_6(_call, __VA_ARGS__) +#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__) +#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__) +#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__) +#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__) +#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__) +#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__) +#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__) +#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__) +#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__) +#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__) +#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__) +#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__) +#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__) +#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__) + +#define ONEAPI_CALL_FOR(x, ...) \ + ONEAPI_GET_NTH_ARG("ignored", \ + ##__VA_ARGS__, \ + ONEAPI_21, \ + ONEAPI_20, \ + ONEAPI_19, \ + ONEAPI_18, \ + ONEAPI_17, \ + ONEAPI_16, \ + ONEAPI_15, \ + ONEAPI_14, \ + ONEAPI_13, \ + ONEAPI_12, \ + ONEAPI_11, \ + ONEAPI_10, \ + ONEAPI_9, \ + ONEAPI_8, \ + ONEAPI_7, \ + ONEAPI_6, \ + ONEAPI_5, \ + ONEAPI_4, \ + ONEAPI_3, \ + ONEAPI_2, \ + ONEAPI_1, \ + ONEAPI_0) \ + (x, ##__VA_ARGS__) + +/* This template automatically casts entries in the void **args array to the types requested by the kernel func. + Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ +#define oneapi_template(...) 
\ + template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \ + void oneapi_call( \ + KernelGlobalsGPU *kg, \ + sycl::handler &cgh, \ + size_t global_size, \ + size_t local_size, \ + void **args, \ + void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \ + { \ + func(kg, \ + global_size, \ + local_size, \ + cgh, \ + ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \ + } + +oneapi_template(0) +oneapi_template(0, 1) +oneapi_template(0, 1, 2) +oneapi_template(0, 1, 2, 3) +oneapi_template(0, 1, 2, 3, 4) +oneapi_template(0, 1, 2, 3, 4, 5) +oneapi_template(0, 1, 2, 3, 4, 5, 6) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20) + + /* clang-format on */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index ad022716207..f2e61d25002 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1571,7 +1571,7 @@ static_assert_align(KernelShaderEvalInput, 16); * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified. * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. 
*/ -typedef enum DeviceKernel { +typedef enum DeviceKernel : int { DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0, DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h index f89eb28b0b7..1ebf085ae13 100644 --- a/intern/cycles/util/atomic.h +++ b/intern/cycles/util/atomic.h @@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float # endif /* __KERNEL_METAL__ */ +# ifdef __KERNEL_ONEAPI__ + +ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source, + float old_val, + float new_val) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*source); + atomic.compare_exchange_weak(old_val, new_val); + return old_val; +} + +ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + 
sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +# endif /* __KERNEL_ONEAPI__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index 434bc12d670..c668638eb02 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in) #else /* CUDA has its own half data type, no need to define then */ -# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) +# if !defined(__KERNEL_CUDA__) && 
!defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -73,7 +73,7 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); @@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h) { #if defined(__KERNEL_METAL__) return half_to_float(h); +#elif defined(__KERNEL_ONEAPI__) + return float(h); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __half2float(h); #else @@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index f1f627588c5..af2f1ea092d 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifndef __HIP__ +#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__) # ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { @@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b) } # endif /* _WIN32 */ -#endif /* __HIP__ */ +#endif /* __HIP__, __KERNEL_ONEAPI__ */ -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) +# ifndef __KERNEL_ONEAPI__ using std::isfinite; using std::isnan; using std::sqrt; +# else +using sycl::sqrt; +# define isfinite(x) sycl::isfinite((x)) +# define isnan(x) sycl::isnan((x)) +# endif ccl_device_inline int abs(int x) { @@ -793,6 +799,8 @@ ccl_device_inline uint 
popcount(uint x) return i & 1; } # endif +#elif defined(__KERNEL_ONEAPI__) +# define popcount(x) sycl::popcount(x) #elif defined(__KERNEL_HIP__) /* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */ # define popcount(x) __popcll(x) @@ -806,6 +814,8 @@ ccl_device_inline uint count_leading_zeros(uint x) return __clz(x); #elif defined(__KERNEL_METAL__) return clz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -824,6 +834,8 @@ ccl_device_inline uint count_trailing_zeros(uint x) return (__ffs(x) - 1); #elif defined(__KERNEL_METAL__) return ctz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::ctz(x); #else assert(x != 0); # ifdef _MSC_VER diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h index d8b2efb7b4b..07b9ec0986b 100644 --- a/intern/cycles/util/types_float2.h +++ b/intern/cycles/util/types_float2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct float2 { float x, y; @@ -20,7 +20,7 @@ struct float2 { ccl_device_inline float2 make_float2(float x, float y); ccl_device_inline void print_float2(const char *label, const float2 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float2_impl.h b/intern/cycles/util/types_float2_impl.h index d67ec946b79..45fc90c52bd 100644 --- a/intern/cycles/util/types_float2_impl.h +++ b/intern/cycles/util/types_float2_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline float float2::operator[](int i) const { util_assert(i >= 0); @@ -39,7 +39,7 @@ ccl_device_inline void print_float2(const char *label, const float2 &a) { printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || 
defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h index 060c2ac4152..c7900acaa69 100644 --- a/intern/cycles/util/types_float3.h +++ b/intern/cycles/util/types_float3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) struct ccl_try_align(16) float3 { # ifdef __KERNEL_SSE__ @@ -40,7 +40,7 @@ struct ccl_try_align(16) float3 ccl_device_inline float3 make_float3(float f); ccl_device_inline float3 make_float3(float x, float y, float z); ccl_device_inline void print_float3(const char *label, const float3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) */ /* Smaller float3 for storage. For math operations this must be converted to float3, so that on the * CPU SIMD instructions can be used. */ diff --git a/intern/cycles/util/types_float3_impl.h b/intern/cycles/util/types_float3_impl.h index f5ffc48c1be..2e6e864c8ea 100644 --- a/intern/cycles/util/types_float3_impl.h +++ b/intern/cycles/util/types_float3_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) # ifdef __KERNEL_SSE__ __forceinline float3::float3() { @@ -83,7 +83,7 @@ ccl_device_inline void print_float3(const char *label, const float3 &a) { printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float4.h b/intern/cycles/util/types_float4.h index 68ba787dac0..27453bf39e4 100644 --- a/intern/cycles/util/types_float4.h +++ b/intern/cycles/util/types_float4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct int4; struct ccl_try_align(16) float4 @@ -43,7 +43,7 @@ ccl_device_inline float4 make_float4(float f); ccl_device_inline float4 make_float4(float x, float y, float z, float w); ccl_device_inline float4 
make_float4(const int4 &i); ccl_device_inline void print_float4(const char *label, const float4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float4_impl.h b/intern/cycles/util/types_float4_impl.h index de2e7cb7061..d7858f744e3 100644 --- a/intern/cycles/util/types_float4_impl.h +++ b/intern/cycles/util/types_float4_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline float4::float4() { @@ -89,7 +89,7 @@ ccl_device_inline void print_float4(const char *label, const float4 &a) { printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h index 99f9ec9b867..d71149946f7 100644 --- a/intern/cycles/util/types_float8.h +++ b/intern/cycles/util/types_float8.h @@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ccl_try_align(32) float8 { @@ -43,7 +43,7 @@ struct ccl_try_align(32) float8 ccl_device_inline float8 make_float8(float f); ccl_device_inline float8 make_float8(float a, float b, float c, float d, float e, float f, float g, float h); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h index 19818976b50..0694f5205a5 100644 --- a/intern/cycles/util/types_float8_impl.h +++ b/intern/cycles/util/types_float8_impl.h @@ -15,7 +15,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_AVX2__ __forceinline float8::float8() 
{ @@ -81,7 +81,7 @@ make_float8(float a, float b, float c, float d, float e, float f, float g, float return r; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int2.h b/intern/cycles/util/types_int2.h index 4daf387d9cf..bf69cddc653 100644 --- a/intern/cycles/util/types_int2.h +++ b/intern/cycles/util/types_int2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct int2 { int x, y; @@ -19,7 +19,7 @@ struct int2 { }; ccl_device_inline int2 make_int2(int x, int y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int2_impl.h b/intern/cycles/util/types_int2_impl.h index 7989c4d5506..7bdc77369ee 100644 --- a/intern/cycles/util/types_int2_impl.h +++ b/intern/cycles/util/types_int2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) int int2::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline int2 make_int2(int x, int y) int2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int3.h b/intern/cycles/util/types_int3.h index ad9bcb39bbe..f88ff22ac35 100644 --- a/intern/cycles/util/types_int3.h +++ b/intern/cycles/util/types_int3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ccl_try_align(16) int3 { # ifdef __KERNEL_SSE__ @@ -40,7 +40,7 @@ struct ccl_try_align(16) int3 ccl_device_inline int3 make_int3(int i); ccl_device_inline int3 make_int3(int x, int y, int z); ccl_device_inline void print_int3(const char *label, const int3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* 
!defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int3_impl.h b/intern/cycles/util/types_int3_impl.h index 4cfc1cf2987..1c49e97ad32 100644 --- a/intern/cycles/util/types_int3_impl.h +++ b/intern/cycles/util/types_int3_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline int3::int3() { @@ -84,7 +84,7 @@ ccl_device_inline void print_int3(const char *label, const int3 &a) { printf("%s: %d %d %d\n", label, a.x, a.y, a.z); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int4.h b/intern/cycles/util/types_int4.h index f35632fb52f..9d557c01344 100644 --- a/intern/cycles/util/types_int4.h +++ b/intern/cycles/util/types_int4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct float3; struct float4; @@ -46,7 +46,7 @@ ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(const float3 &f); ccl_device_inline int4 make_int4(const float4 &f); ccl_device_inline void print_int4(const char *label, const int4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int4_impl.h b/intern/cycles/util/types_int4_impl.h index adb4a4cebac..11e1ede6705 100644 --- a/intern/cycles/util/types_int4_impl.h +++ b/intern/cycles/util/types_int4_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline int4::int4() { @@ -83,6 +83,8 @@ ccl_device_inline int4 make_int4(const float3 &f) { # ifdef __KERNEL_SSE__ int4 a(_mm_cvtps_epi32(f.m128)); +# elif defined(__KERNEL_ONEAPI__) + int4 a = {(int)f.x, 
(int)f.y, (int)f.z, 0}; # else int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; # endif @@ -103,7 +105,7 @@ ccl_device_inline void print_int4(const char *label, const int4 &a) { printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar2.h b/intern/cycles/util/types_uchar2.h index 445fa8dd703..0b3c9bd0331 100644 --- a/intern/cycles/util/types_uchar2.h +++ b/intern/cycles/util/types_uchar2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar2 { uchar x, y; @@ -19,7 +19,7 @@ struct uchar2 { }; ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar2_impl.h b/intern/cycles/util/types_uchar2_impl.h index cec1c679050..a7254d5eaf2 100644 --- a/intern/cycles/util/types_uchar2_impl.h +++ b/intern/cycles/util/types_uchar2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar2::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) uchar2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3.h b/intern/cycles/util/types_uchar3.h index 1ebd86441c3..fc213502ada 100644 --- a/intern/cycles/util/types_uchar3.h +++ b/intern/cycles/util/types_uchar3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar3 { uchar x, y, z; @@ -19,7 +19,7 @@ struct uchar3 { }; ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); 
-#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3_impl.h b/intern/cycles/util/types_uchar3_impl.h index 0656baa3da4..0c24ffb488a 100644 --- a/intern/cycles/util/types_uchar3_impl.h +++ b/intern/cycles/util/types_uchar3_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar3::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) uchar3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar4.h b/intern/cycles/util/types_uchar4.h index 2ac4fb56cbb..a2a2c945aaa 100644 --- a/intern/cycles/util/types_uchar4.h +++ b/intern/cycles/util/types_uchar4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar4 { uchar x, y, z, w; @@ -19,7 +19,7 @@ struct uchar4 { }; ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar4_impl.h b/intern/cycles/util/types_uchar4_impl.h index b3e8abfe873..8ec6213a37d 100644 --- a/intern/cycles/util/types_uchar4_impl.h +++ b/intern/cycles/util/types_uchar4_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar4::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) uchar4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git 
a/intern/cycles/util/types_uint2.h b/intern/cycles/util/types_uint2.h index e3254b9f0e1..faa0955f903 100644 --- a/intern/cycles/util/types_uint2.h +++ b/intern/cycles/util/types_uint2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint2 { uint x, y; @@ -19,7 +19,7 @@ struct uint2 { }; ccl_device_inline uint2 make_uint2(uint x, uint y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint2_impl.h b/intern/cycles/util/types_uint2_impl.h index e67134a011e..cac0ba6b531 100644 --- a/intern/cycles/util/types_uint2_impl.h +++ b/intern/cycles/util/types_uint2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint2::operator[](uint i) const { util_assert(i < 2); @@ -28,7 +28,7 @@ ccl_device_inline uint2 make_uint2(uint x, uint y) uint2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint3.h b/intern/cycles/util/types_uint3.h index 885a8fb84ce..3ff87bfc791 100644 --- a/intern/cycles/util/types_uint3.h +++ b/intern/cycles/util/types_uint3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint3 { uint x, y, z; @@ -19,7 +19,7 @@ struct uint3 { }; ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint3_impl.h b/intern/cycles/util/types_uint3_impl.h index f4d3d72469c..221883a1adb 100644 --- a/intern/cycles/util/types_uint3_impl.h +++ b/intern/cycles/util/types_uint3_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef 
__KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint3::operator[](uint i) const { util_assert(i < 3); @@ -28,7 +28,7 @@ ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) uint3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint4.h b/intern/cycles/util/types_uint4.h index d582b91d2a0..504095b2383 100644 --- a/intern/cycles/util/types_uint4.h +++ b/intern/cycles/util/types_uint4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint4 { uint x, y, z, w; @@ -19,7 +19,7 @@ struct uint4 { }; ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint4_impl.h b/intern/cycles/util/types_uint4_impl.h index 98a4c5e9fe9..d78db944a1f 100644 --- a/intern/cycles/util/types_uint4_impl.h +++ b/intern/cycles/util/types_uint4_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint4::operator[](uint i) const { util_assert(i < 3); @@ -28,7 +28,7 @@ ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) uint4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_ushort4.h b/intern/cycles/util/types_ushort4.h index 1766c6bf734..9a6e12095ba 100644 --- a/intern/cycles/util/types_ushort4.h +++ b/intern/cycles/util/types_ushort4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ushort4 { uint16_t x, y, z, w; |