git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/CMakeLists.txt | 29
-rw-r--r--  intern/cycles/device/cpu/device_impl.cpp | 28
-rw-r--r--  intern/cycles/device/cuda/device.cpp | 28
-rw-r--r--  intern/cycles/device/cuda/device_impl.cpp | 68
-rw-r--r--  intern/cycles/device/cuda/queue.cpp | 6
-rw-r--r--  intern/cycles/device/device.cpp | 46
-rw-r--r--  intern/cycles/device/device.h | 9
-rw-r--r--  intern/cycles/device/hip/device.cpp | 25
-rw-r--r--  intern/cycles/device/hip/device_impl.cpp | 57
-rw-r--r--  intern/cycles/device/hip/queue.cpp | 6
-rw-r--r--  intern/cycles/device/hip/util.h | 2
-rw-r--r--  intern/cycles/device/memory.h | 2
-rw-r--r--  intern/cycles/device/metal/device.mm | 14
-rw-r--r--  intern/cycles/device/metal/device_impl.h | 12
-rw-r--r--  intern/cycles/device/metal/device_impl.mm | 231
-rw-r--r--  intern/cycles/device/metal/kernel.h | 30
-rw-r--r--  intern/cycles/device/metal/kernel.mm | 221
-rw-r--r--  intern/cycles/device/metal/queue.h | 1
-rw-r--r--  intern/cycles/device/metal/queue.mm | 21
-rw-r--r--  intern/cycles/device/metal/util.h | 12
-rw-r--r--  intern/cycles/device/metal/util.mm | 74
-rw-r--r--  intern/cycles/device/oneapi/device.cpp | 185
-rw-r--r--  intern/cycles/device/oneapi/device.h | 24
-rw-r--r--  intern/cycles/device/oneapi/device_impl.cpp | 446
-rw-r--r--  intern/cycles/device/oneapi/device_impl.h | 104
-rw-r--r--  intern/cycles/device/oneapi/dll_interface.h | 17
-rw-r--r--  intern/cycles/device/oneapi/queue.cpp | 136
-rw-r--r--  intern/cycles/device/oneapi/queue.h | 51
-rw-r--r--  intern/cycles/device/optix/device.cpp | 6
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp | 70
-rw-r--r--  intern/cycles/device/optix/queue.cpp | 1
-rw-r--r--  intern/cycles/device/queue.cpp | 21
-rw-r--r--  intern/cycles/device/queue.h | 7
33 files changed, 1681 insertions(+), 309 deletions(-)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6205775260a..24855d795d1 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -3,12 +3,9 @@
set(INC
..
- ../../glew-mx
)
-set(INC_SYS
- ${GLEW_INCLUDE_DIR}
-)
+set(INC_SYS )
if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
if(WITH_CUDA_DYNLOAD)
@@ -22,6 +19,8 @@ if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
)
add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
endif()
+
+ add_definitions(-DCYCLES_RUNTIME_OPTIX_ROOT_DIR="${CYCLES_RUNTIME_OPTIX_ROOT_DIR}")
endif()
if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
@@ -82,6 +81,15 @@ set(SRC_HIP
hip/util.h
)
+set(SRC_ONEAPI
+ oneapi/device_impl.cpp
+ oneapi/device_impl.h
+ oneapi/device.cpp
+ oneapi/device.h
+ oneapi/queue.cpp
+ oneapi/queue.h
+)
+
set(SRC_DUMMY
dummy/device.cpp
dummy/device.h
@@ -134,13 +142,13 @@ set(SRC
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}
+ ${SRC_ONEAPI}
${SRC_HEADERS}
)
set(LIB
cycles_kernel
cycles_util
- ${CYCLES_GL_LIBRARIES}
)
if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
@@ -161,8 +169,6 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
)
endif()
-add_definitions(${GL_DEFINITIONS})
-
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -181,6 +187,9 @@ if(WITH_CYCLES_DEVICE_METAL)
${SRC_METAL}
)
endif()
+if (WITH_CYCLES_DEVICE_ONEAPI)
+ add_definitions(-DWITH_ONEAPI)
+endif()
if(WITH_OPENIMAGEDENOISE)
list(APPEND LIB
@@ -193,6 +202,11 @@ include_directories(SYSTEM ${INC_SYS})
cycles_add_library(cycles_device "${LIB}" ${SRC})
+if(WITH_CYCLES_DEVICE_ONEAPI)
+ # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due to external project behaviour
+ add_dependencies(cycles_device cycles_kernel_oneapi)
+endif()
+
source_group("cpu" FILES ${SRC_CPU})
source_group("cuda" FILES ${SRC_CUDA})
source_group("dummy" FILES ${SRC_DUMMY})
@@ -200,4 +214,5 @@ source_group("hip" FILES ${SRC_HIP})
source_group("multi" FILES ${SRC_MULTI})
source_group("metal" FILES ${SRC_METAL})
source_group("optix" FILES ${SRC_OPTIX})
+source_group("oneapi" FILES ${SRC_ONEAPI})
source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index 612c391f7d5..1e4b9baa0c0 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -51,12 +51,12 @@
CCL_NAMESPACE_BEGIN
CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
- : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+ : Device(info_, stats_, profiler_), texture_info(this, "texture_info", MEM_GLOBAL)
{
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
* optimization. */
- VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
- << " CPU kernels.";
+ VLOG_INFO << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+ << " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::max_concurrency();
@@ -111,9 +111,9 @@ void CPUDevice::mem_alloc(device_memory &mem)
}
else {
if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
}
if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
@@ -192,12 +192,12 @@ device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_
void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
{
#ifdef WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
+ if (strcmp(name, "data") == 0) {
assert(size <= sizeof(KernelData));
// Update scene handle (since it is different for each device on multi devices)
KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
+ data->device_bvh = embree_scene;
}
#endif
kernel_const_copy(&kernel_globals, name, host, size);
@@ -205,9 +205,9 @@ void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
void CPUDevice::global_alloc(device_memory &mem)
{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
@@ -227,9 +227,9 @@ void CPUDevice::global_free(device_memory &mem)
void CPUDevice::tex_alloc(device_texture &mem)
{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mem.device_pointer = (device_ptr)mem.host_pointer;
mem.device_size = mem.memory_size();
diff --git a/intern/cycles/device/cuda/device.cpp b/intern/cycles/device/cuda/device.cpp
index 400490336d6..5a213c45b71 100644
--- a/intern/cycles/device/cuda/device.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -29,24 +29,25 @@ bool device_cuda_init()
initialized = true;
int cuew_result = cuewInit(CUEW_INIT_CUDA);
if (cuew_result == CUEW_SUCCESS) {
- VLOG(1) << "CUEW initialization succeeded";
+ VLOG_INFO << "CUEW initialization succeeded";
if (CUDADevice::have_precompiled_kernels()) {
- VLOG(1) << "Found precompiled kernels";
+ VLOG_INFO << "Found precompiled kernels";
result = true;
}
else if (cuewCompilerPath() != NULL) {
- VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
+ VLOG_INFO << "Found CUDA compiler " << cuewCompilerPath();
result = true;
}
else {
- VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
- << " unable to use CUDA";
+ VLOG_INFO << "Neither precompiled kernels nor CUDA compiler was found,"
+ << " unable to use CUDA";
}
}
else {
- VLOG(1) << "CUEW initialization failed: "
- << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
+ VLOG_WARNING << "CUEW initialization failed: "
+ << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ?
+ "Error setting up atexit() handler" :
+ "Error opening the library");
}
return result;
@@ -121,7 +122,8 @@ void device_cuda_info(vector<DeviceInfo> &devices)
int major;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
if (major < 3) {
- VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported.";
+ VLOG_INFO << "Ignoring device \"" << name
+ << "\", this graphics card is no longer supported.";
continue;
}
@@ -166,21 +168,21 @@ void device_cuda_info(vector<DeviceInfo> &devices)
* Windows 10 even when it is, due to an issue in application profiles.
* Detect case where we expect it to be available and override. */
if (preempt_attr == 0 && (major >= 6) && system_windows_version_at_least(10, 17134)) {
- VLOG(1) << "Assuming device has compute preemption on Windows 10.";
+ VLOG_INFO << "Assuming device has compute preemption on Windows 10.";
preempt_attr = 1;
}
if (timeout_attr && !preempt_attr) {
- VLOG(1) << "Device is recognized as display.";
+ VLOG_INFO << "Device is recognized as display.";
info.description += " (Display)";
info.display_device = true;
display_devices.push_back(info);
}
else {
- VLOG(1) << "Device has compute preemption or is not used for display.";
+ VLOG_INFO << "Device has compute preemption or is not used for display.";
devices.push_back(info);
}
- VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+ VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
}
if (!display_devices.empty())
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index cb7e909a2d5..01c021551f3 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -23,6 +23,8 @@
# include "util/types.h"
# include "util/windows.h"
+# include "kernel/device/cuda/globals.h"
+
CCL_NAMESPACE_BEGIN
class CUDADevice;
@@ -51,7 +53,7 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
- : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+ : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
first_error = true;
@@ -244,9 +246,9 @@ string CUDADevice::compile_kernel(const uint kernel_features,
if (!use_adaptive_compilation()) {
if (!force_ptx) {
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
+ VLOG_INFO << "Using precompiled kernel.";
return cubin;
}
}
@@ -256,9 +258,9 @@ string CUDADevice::compile_kernel(const uint kernel_features,
while (ptx_major >= 3) {
const string ptx = path_get(
string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
+ VLOG_INFO << "Using precompiled kernel.";
return ptx;
}
@@ -287,9 +289,9 @@ string CUDADevice::compile_kernel(const uint kernel_features,
const string cubin_file = string_printf(
"cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
+ VLOG_INFO << "Using locally compiled kernel.";
return cubin;
}
@@ -323,7 +325,7 @@ string CUDADevice::compile_kernel(const uint kernel_features,
}
const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
if (nvcc_cuda_version < 101) {
printf(
"Unsupported CUDA version %d.%d detected, "
@@ -399,7 +401,8 @@ bool CUDADevice::load_kernels(const uint kernel_features)
*/
if (cuModule) {
if (use_adaptive_compilation()) {
- VLOG(1) << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
+ VLOG_INFO
+ << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
}
return true;
}
@@ -481,8 +484,8 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
cuMemGetInfo(&free_after, &total);
}
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+ VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
# if 0
/* For testing mapped host memory, fill up device memory. */
@@ -513,7 +516,7 @@ void CUDADevice::init_host_memory()
}
}
else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
@@ -524,8 +527,8 @@ void CUDADevice::init_host_memory()
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+ VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
@@ -593,7 +596,7 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
+ VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
@@ -701,9 +704,9 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
}
if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
+ VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
@@ -899,9 +902,19 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
CUdeviceptr mem;
size_t bytes;
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
+ assert(bytes == sizeof(KernelParamsCUDA));
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_DATA_ARRAY(data_type, data_name) \
+ if (strcmp(name, #data_name) == 0) { \
+ cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
+ return; \
+ }
+ KERNEL_DATA_ARRAY(KernelData, data)
+ KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
+# include "kernel/data_arrays.h"
+# undef KERNEL_DATA_ARRAY
}
void CUDADevice::global_alloc(device_memory &mem)
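(Reading aid, not part of the patch: the KERNEL_DATA_ARRAY X-macro above expands into one strcmp() branch per entry, so a single constant-memory struct "kernel_params" replaces the per-name module globals that the deleted cuModuleGetGlobal(..., name) lookup used to resolve. A minimal sketch of the preprocessed result for the two entries spelled out explicitly:)

  /* Sketch of what the preprocessor generates from the block above. */
  if (strcmp(name, "data") == 0) {
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data), host, size));
    return;
  }
  if (strcmp(name, "integrator_state") == 0) {
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, integrator_state), host, size));
    return;
  }
  /* ...followed by one such branch per array declared in kernel/data_arrays.h. */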
@@ -925,7 +938,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
{
CUDAContextScope scope(this);
- string bind_name = mem.name;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();
@@ -1008,9 +1020,9 @@ void CUDADevice::tex_alloc(device_texture &mem)
desc.NumChannels = mem.data_elements;
desc.Flags = 0;
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
cuda_assert(cuArray3DCreate(&array_3d, &desc));
@@ -1190,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop()
}
vector<CUdevice> gl_devices(num_all_devices);
- uint num_gl_devices;
+ uint num_gl_devices = 0;
cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
- for (CUdevice gl_device : gl_devices) {
- if (gl_device == cuDevice) {
+ for (uint i = 0; i < num_gl_devices; ++i) {
+ if (gl_devices[i] == cuDevice) {
return true;
}
}
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 38c71866ad0..5912e68a92b 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -39,12 +39,12 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
num_states = max((int)(num_states * factor), 1024);
}
else {
- VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+ VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
}
}
- VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
- << string_human_readable_size(num_states * state_size);
+ VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
return num_states;
}
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ea5b3c6dc8c..ace6ed517f5 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -16,6 +16,7 @@
#include "device/hip/device.h"
#include "device/metal/device.h"
#include "device/multi/device.h"
+#include "device/oneapi/device.h"
#include "device/optix/device.h"
#include "util/foreach.h"
@@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::hip_devices;
vector<DeviceInfo> Device::metal_devices;
+vector<DeviceInfo> Device::oneapi_devices;
uint Device::devices_initialized_mask = 0;
/* Device */
@@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
device = device_metal_create(info, stats, profiler);
break;
#endif
+
+#ifdef WITH_ONEAPI
+ case DEVICE_ONEAPI:
+ device = device_oneapi_create(info, stats, profiler);
+ break;
+#endif
+
default:
break;
}
@@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_HIP;
else if (strcmp(name, "METAL") == 0)
return DEVICE_METAL;
+ else if (strcmp(name, "ONEAPI") == 0)
+ return DEVICE_ONEAPI;
return DEVICE_NONE;
}
@@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type)
return "HIP";
else if (type == DEVICE_METAL)
return "METAL";
+ else if (type == DEVICE_ONEAPI)
+ return "ONEAPI";
return "";
}
@@ -164,6 +177,9 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_METAL
types.push_back(DEVICE_METAL);
#endif
+#ifdef WITH_ONEAPI
+ types.push_back(DEVICE_ONEAPI);
+#endif
return types;
}
@@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
#endif
+#ifdef WITH_ONEAPI
+ if (mask & DEVICE_MASK_ONEAPI) {
+ if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) {
+ if (device_oneapi_init()) {
+ device_oneapi_info(oneapi_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_ONEAPI;
+ }
+ foreach (DeviceInfo &info, oneapi_devices) {
+ devices.push_back(info);
+ }
+ }
+#endif
+
if (mask & DEVICE_MASK_CPU) {
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
device_cpu_info(cpu_devices);
@@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask)
}
#endif
+#ifdef WITH_ONEAPI
+ if (mask & DEVICE_MASK_ONEAPI) {
+ if (device_oneapi_init()) {
+ capabilities += "\noneAPI device capabilities:\n";
+ capabilities += device_oneapi_capabilities();
+ }
+ }
+#endif
+
#ifdef WITH_METAL
if (mask & DEVICE_MASK_METAL) {
if (device_metal_init()) {
@@ -325,8 +364,8 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
int orig_cpu_threads = (threads) ? threads : TaskScheduler::max_concurrency();
int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), size_t(0));
- VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads
- << ", to dedicate to GPU.";
+ VLOG_INFO << "CPU render threads reduced from " << orig_cpu_threads << " to "
+ << cpu_threads << ", to dedicate to GPU.";
if (cpu_threads >= 1) {
DeviceInfo cpu_device = device;
@@ -338,7 +377,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
}
else {
- VLOG(1) << "CPU render threads disabled for interactive render.";
+ VLOG_INFO << "CPU render threads disabled for interactive render.";
continue;
}
}
@@ -380,6 +419,7 @@ void Device::free_memory()
cuda_devices.free_memory();
optix_devices.free_memory();
hip_devices.free_memory();
+ oneapi_devices.free_memory();
cpu_devices.free_memory();
metal_devices.free_memory();
}
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 927caae600c..cdb13ca0a97 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,7 @@ class DeviceQueue;
class Progress;
class CPUKernels;
class CPUKernelThreadGlobals;
+class Scene;
/* Device Types */
@@ -40,6 +41,7 @@ enum DeviceType {
DEVICE_OPTIX,
DEVICE_HIP,
DEVICE_METAL,
+ DEVICE_ONEAPI,
DEVICE_DUMMY,
};
@@ -49,6 +51,7 @@ enum DeviceTypeMask {
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
DEVICE_MASK_METAL = (1 << DEVICE_METAL),
+ DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI),
DEVICE_MASK_ALL = ~0
};
@@ -184,6 +187,11 @@ class Device {
return 0;
}
+ /* Called after kernel texture setup, and prior to integrator state setup. */
+ virtual void optimize_for_scene(Scene * /*scene*/)
+ {
+ }
+
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
/* Memory is always resident if this is not a multi device, regardless of whether the pointer
@@ -273,6 +281,7 @@ class Device {
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> hip_devices;
static vector<DeviceInfo> metal_devices;
+ static vector<DeviceInfo> oneapi_devices;
static uint devices_initialized_mask;
};
diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp
index d6a5ed9c419..3c9c73e7db0 100644
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -29,30 +29,31 @@ bool device_hip_init()
initialized = true;
int hipew_result = hipewInit(HIPEW_INIT_HIP);
if (hipew_result == HIPEW_SUCCESS) {
- VLOG(1) << "HIPEW initialization succeeded";
+ VLOG_INFO << "HIPEW initialization succeeded";
if (HIPDevice::have_precompiled_kernels()) {
- VLOG(1) << "Found precompiled kernels";
+ VLOG_INFO << "Found precompiled kernels";
result = true;
}
else if (hipewCompilerPath() != NULL) {
- VLOG(1) << "Found HIPCC " << hipewCompilerPath();
+ VLOG_INFO << "Found HIPCC " << hipewCompilerPath();
result = true;
}
else {
- VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
- << " unable to use HIP";
+ VLOG_INFO << "Neither precompiled kernels nor HIPCC was found,"
+ << " unable to use HIP";
}
}
else {
if (hipew_result == HIPEW_ERROR_ATEXIT_FAILED) {
- VLOG(1) << "HIPEW initialization failed: Error setting up atexit() handler";
+ VLOG_WARNING << "HIPEW initialization failed: Error setting up atexit() handler";
}
else if (hipew_result == HIPEW_ERROR_OLD_DRIVER) {
- VLOG(1) << "HIPEW initialization failed: Driver version too old, requires AMD Radeon Pro "
- "21.Q4 driver or newer";
+ VLOG_WARNING
+ << "HIPEW initialization failed: Driver version too old, requires AMD Radeon Pro "
+ "21.Q4 driver or newer";
}
else {
- VLOG(1) << "HIPEW initialization failed: Error opening HIP dynamic library";
+ VLOG_WARNING << "HIPEW initialization failed: Error opening HIP dynamic library";
}
}
@@ -165,16 +166,16 @@ void device_hip_info(vector<DeviceInfo> &devices)
hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);
if (timeout_attr && !preempt_attr) {
- VLOG(1) << "Device is recognized as display.";
+ VLOG_INFO << "Device is recognized as display.";
info.description += " (Display)";
info.display_device = true;
display_devices.push_back(info);
}
else {
- VLOG(1) << "Device has compute preemption or is not used for display.";
+ VLOG_INFO << "Device has compute preemption or is not used for display.";
devices.push_back(info);
}
- VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+ VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
}
if (!display_devices.empty())
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index ea68c821166..a84f1edd70e 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -16,7 +16,6 @@
# include "util/log.h"
# include "util/map.h"
# include "util/md5.h"
-# include "util/opengl.h"
# include "util/path.h"
# include "util/string.h"
# include "util/system.h"
@@ -24,6 +23,8 @@
# include "util/types.h"
# include "util/windows.h"
+# include "kernel/device/hip/globals.h"
+
CCL_NAMESPACE_BEGIN
class HIPDevice;
@@ -52,7 +53,7 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
- : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+ : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
first_error = true;
@@ -233,9 +234,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
/* Attempt to use kernel provided with Blender. */
if (!use_adaptive_compilation()) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
- VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
+ VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
- VLOG(1) << "Using precompiled kernel.";
+ VLOG_INFO << "Using precompiled kernel.";
return fatbin;
}
}
@@ -265,9 +266,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
const string include_path = source_path;
const string fatbin_file = string_printf("cycles_%s_%s_%s", name, arch, kernel_md5.c_str());
const string fatbin = path_cache_get(path_join("kernels", fatbin_file));
- VLOG(1) << "Testing for locally compiled kernel " << fatbin << ".";
+ VLOG_INFO << "Testing for locally compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
- VLOG(1) << "Using locally compiled kernel.";
+ VLOG_INFO << "Using locally compiled kernel.";
return fatbin;
}
@@ -301,7 +302,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
}
const int hipcc_hip_version = hipewCompilerVersion();
- VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
+ VLOG_INFO << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
if (hipcc_hip_version < 40) {
printf(
"Unsupported HIP version %d.%d detected, "
@@ -361,7 +362,7 @@ bool HIPDevice::load_kernels(const uint kernel_features)
*/
if (hipModule) {
if (use_adaptive_compilation()) {
- VLOG(1) << "Skipping HIP kernel reload for adaptive compilation, not currently supported.";
+ VLOG_INFO << "Skipping HIP kernel reload for adaptive compilation, not currently supported.";
}
return true;
}
@@ -444,8 +445,8 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
hipMemGetInfo(&free_after, &total);
}
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+ VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
# if 0
/* For testing mapped host memory, fill up device memory. */
@@ -476,7 +477,7 @@ void HIPDevice::init_host_memory()
}
}
else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
@@ -487,8 +488,8 @@ void HIPDevice::init_host_memory()
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+ VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void HIPDevice::load_texture_info()
@@ -556,7 +557,7 @@ void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
+ VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
@@ -658,9 +659,9 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
}
if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
+ VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
@@ -856,8 +857,19 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
hipDeviceptr_t mem;
size_t bytes;
- hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
- hip_assert(hipMemcpyHtoD(mem, host, size));
+ hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, "kernel_params"));
+ assert(bytes == sizeof(KernelParamsHIP));
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_DATA_ARRAY(data_type, data_name) \
+ if (strcmp(name, #data_name) == 0) { \
+ hip_assert(hipMemcpyHtoD(mem + offsetof(KernelParamsHIP, data_name), host, size)); \
+ return; \
+ }
+ KERNEL_DATA_ARRAY(KernelData, data)
+ KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
+# include "kernel/data_arrays.h"
+# undef KERNEL_DATA_ARRAY
}
void HIPDevice::global_alloc(device_memory &mem)
@@ -881,7 +893,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
{
HIPContextScope scope(this);
- string bind_name = mem.name;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();
@@ -966,9 +977,9 @@ void HIPDevice::tex_alloc(device_texture &mem)
desc.NumChannels = mem.data_elements;
desc.Flags = 0;
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
hip_assert(hipArray3DCreate((hArray *)&array_3d, &desc));
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 6c2c2c29624..8b3d963a32f 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -39,12 +39,12 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
num_states = max((int)(num_states * factor), 1024);
}
else {
- VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+ VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
}
}
- VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
- << string_human_readable_size(num_states * state_size);
+ VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
return num_states;
}
diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h
index adb68a2d44c..4e4906171d1 100644
--- a/intern/cycles/device/hip/util.h
+++ b/intern/cycles/device/hip/util.h
@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
- return (major > 10) || (major == 10 && minor >= 1);
+ return (major >= 9);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index 55d6d39cef8..5f44475077e 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -350,7 +350,7 @@ template<typename T> class device_only_memory : public device_memory {
*
* When using memory type MEM_GLOBAL, a pointer to this memory will be
* automatically attached to kernel globals, using the provided name
- * matching an entry in kernel_textures.h. */
+ * matching an entry in kernel/data_arrays.h. */
template<typename T> class device_vector : public device_memory {
public:
diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm
index d7f190fc01e..51e3323370a 100644
--- a/intern/cycles/device/metal/device.mm
+++ b/intern/cycles/device/metal/device.mm
@@ -34,7 +34,8 @@ void device_metal_info(vector<DeviceInfo> &devices)
int device_index = 0;
for (id<MTLDevice> &device : usable_devices) {
/* Compute unique ID for persistent user preferences. */
- string device_name = [device.name UTF8String];
+ string device_name = MetalInfo::get_device_name(device);
+
string id = string("METAL_") + device_name;
/* Hardware ID might not be unique, add device number in that case. */
@@ -48,12 +49,6 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.type = DEVICE_METAL;
info.description = string_remove_trademark(string(device_name));
- /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and
- * GPU */
- if (info.description == system_cpu_brand_string()) {
- info.description += " (GPU)";
- }
-
info.num = device_index;
/* We don't know if it's used for display, but assume it is. */
info.display_device = true;
@@ -69,14 +64,15 @@ string device_metal_capabilities()
{
string result = "";
auto allDevices = MTLCopyAllDevices();
- uint32_t num_devices = allDevices.count;
+ uint32_t num_devices = (uint32_t)allDevices.count;
if (num_devices == 0) {
return "No Metal devices found\n";
}
result += string_printf("Number of devices: %u\n", num_devices);
for (id<MTLDevice> device in allDevices) {
- result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]);
+ string device_name = MetalInfo::get_device_name(device);
+ result += string_printf("\t\tDevice: %s\n", device_name.c_str());
}
return result;
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 0e6817d94f8..99e60d3a788 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -42,7 +42,6 @@ class MetalDevice : public Device {
nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
/*---------------------------------------------------*/
- string device_name;
MetalGPUVendor device_vendor;
uint kernel_features;
@@ -76,7 +75,8 @@ class MetalDevice : public Device {
std::vector<id<MTLTexture>> texture_slot_map;
bool use_metalrt = false;
- bool use_function_specialisation = false;
+ MetalPipelineType kernel_specialization_level = PSO_GENERIC;
+ std::atomic_bool async_compile_and_load = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -92,9 +92,7 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
- string get_source(const uint kernel_features);
-
- string compile_kernel(const uint kernel_features, const char *name);
+ void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
@@ -112,7 +110,9 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- id<MTLLibrary> compile(string const &source);
+ virtual void optimize_for_scene(Scene *scene) override;
+
+ bool compile_and_load(MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 086bf0af979..d1250b83d22 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -6,9 +6,12 @@
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
+# include "scene/scene.h"
+
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
+# include "util/time.h"
CCL_NAMESPACE_BEGIN
@@ -35,7 +38,7 @@ void MetalDevice::set_error(const string &error)
}
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
- : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+ : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
mtlDevId = info.num;
@@ -43,10 +46,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
auto usable_devices = MetalInfo::get_usable_devices();
assert(mtlDevId < usable_devices.size());
mtlDevice = usable_devices[mtlDevId];
- device_name = [mtlDevice.name UTF8String];
- device_vendor = MetalInfo::get_vendor_from_device_name(device_name);
+ device_vendor = MetalInfo::get_device_vendor(mtlDevice);
assert(device_vendor != METAL_GPU_UNKNOWN);
- metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str());
+ metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str());
/* determine default storage mode based on whether UMA is supported */
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
+
+ /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
+ * quickly. */
+ kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
}
}
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
+ if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
+ kernel_specialization_level = (MetalPipelineType)atoi(envstr);
+ }
+ metal_printf("kernel_specialization_level = %s\n",
+ kernel_type_as_string(
+ (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
+
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
-string MetalDevice::get_source(const uint kernel_features)
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
- string build_options;
-
+ string global_defines;
if (use_adaptive_compilation()) {
- build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_metalrt) {
- build_options += "-D__METALRT__ ";
+ global_defines += "#define __METALRT__\n";
if (motion_blur) {
- build_options += "-D__METALRT_MOTION__ ";
+ global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
- build_options += "-D__KERNEL_DEBUG__ ";
+ global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
- build_options += "-D__KERNEL_METAL_INTEL__ ";
+ global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
- build_options += "-D__KERNEL_METAL_AMD__ ";
+ global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
- build_options += "-D__KERNEL_METAL_APPLE__ ";
+ global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
- /* reformat -D defines list into compilable form */
- vector<string> components;
- string_replace(build_options, "-D", "");
- string_split(components, build_options, " ");
+ string &source = this->source[pso_type];
+ source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
+ source = path_source_replace_includes(source, path_get("source"));
- string globalDefines;
- for (const string &component : components) {
- vector<string> assignments;
- string_split(assignments, component, "=");
- if (assignments.size() == 2)
- globalDefines += string_printf(
- "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
- else
- globalDefines += string_printf("#define %s\n", assignments[0].c_str());
+ /* Perform any required specialization on the source.
+ * With Metal function constants we can generate a single variant of the kernel source which can
+ * be repeatedly respecialized.
+ */
+ string baked_constants;
+
+ /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
+ * the same character length. Build a string of all active constant values which is then hashed
+ * in order to identify the PSO.
+ */
+ if (pso_type != PSO_GENERIC) {
+ const double starttime = time_dt();
+
+# define KERNEL_STRUCT_BEGIN(name, parent) \
+ string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+ /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ baked_constants += string(#parent "." #name "=") + \
+ to_string(_type(launch_params.data.parent.name)) + "\n";
+
+# include "kernel/data_template.h"
+
+ /* Opt in to all available specializations. This can be made more granular for the
+ * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+ * but the overhead should be negligible as these are very quick to (re)build and aren't
+ * serialized to disk via MTLBinaryArchives.
+ */
+ global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+
+ metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
- string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
- source = path_source_replace_includes(source, path_get("source"));
-
- metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+ source = global_defines + source;
+ metal_printf("================\n%s================\n%s================\n",
+ global_defines.c_str(),
+ baked_constants.c_str());
- return source;
+ /* Generate an MD5 from the source and include any baked constants. This is used when caching
+ * PSOs. */
+ MD5Hash md5;
+ md5.append(baked_constants);
+ md5.append(source);
+ source_md5[pso_type] = md5.get_hex();
}
bool MetalDevice::load_kernels(const uint _kernel_features)
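(Reading aid, not part of the patch: the KERNEL_STRUCT_BEGIN rewrite above depends on the replacement preserving character offsets in the source, so compiler diagnostics and cached positions stay valid. A self-contained sketch of the assumed behaviour of string_replace_same_length(), using kernel_data.film.exposure as a hypothetical example member:)

  #include <cassert>
  #include <string>

  /* Minimal stand-in for the assumed behaviour of string_replace_same_length():
   * replaces every occurrence of `from` with an equally long `to`, leaving all
   * other character offsets in `source` untouched. */
  static void replace_same_length(std::string &source,
                                  const std::string &from,
                                  const std::string &to)
  {
    assert(from.size() == to.size());
    for (size_t pos = 0; (pos = source.find(from, pos)) != std::string::npos; pos += to.size())
      source.replace(pos, from.size(), to);
  }

  /* Usage: turn a struct dereference into a function-constant identifier.
   *   std::string src = "float exposure = kernel_data.film.exposure;";
   *   replace_same_length(src, "kernel_data.film.", "kernel_data_film_");
   *   // src == "float exposure = kernel_data_film_exposure;"
   * The identifier can then be bound to a Metal function constant whose baked
   * value the compiler folds into the specialized PSO. */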
@@ -279,24 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
- source[PSO_GENERIC] = get_source(kernel_features);
- mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
- MD5Hash md5;
- md5.append(source[PSO_GENERIC]);
- source_md5[PSO_GENERIC] = md5.get_hex();
-
- metal_printf("Front-end compilation finished (generic)\n");
-
- bool result = MetalDeviceKernels::load(this, false);
+ bool result = compile_and_load(PSO_GENERIC);
reserve_local_memory(kernel_features);
-
return result;
}
-id<MTLLibrary> MetalDevice::compile(string const &source)
+bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
{
+ make_source(pso_type, kernel_features);
+
+ if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
+ /* We already have a full set of matching pipelines which are cached or queued. */
+ metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
+ return true;
+ }
+
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
@@ -304,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
options.languageVersion = MTLLanguageVersion2_4;
}
+ if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
+ path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
+ source[pso_type]);
+ }
+
+ const double starttime = time_dt();
+
NSError *error = NULL;
- id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
- options:options
- error:&error];
+ mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
+ options:options
+ error:&error];
- if (!mtlLibrary) {
+ if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
+ metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
+
[options release];
- return mtlLibrary;
+ return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -411,9 +458,9 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
}
if (mem.name) {
- VLOG(2) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
}
mem.device_size = metal_buffer.allocatedSize;
@@ -623,11 +670,63 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
+void MetalDevice::optimize_for_scene(Scene *scene)
+{
+ MetalPipelineType specialization_level = kernel_specialization_level;
+
+ if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
+ return;
+ }
+
+ /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
+ * synchronously. */
+ compile_and_load(PSO_SPECIALIZED_INTERSECT);
+
+ if (specialization_level < PSO_SPECIALIZED_SHADE) {
+ return;
+ }
+ if (!scene->params.background) {
+ /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
+ * build. */
+ return;
+ }
+
+ /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
+ * only if there isn't an existing load in flight.
+ */
+ auto specialize_shade_fn = ^() {
+ compile_and_load(PSO_SPECIALIZED_SHADE);
+ async_compile_and_load = false;
+ };
+
+ bool async_specialize_shade = true;
+
+ /* Block if per-kernel profiling is enabled (ensures a steady rendering rate). */
+ if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
+ async_specialize_shade = false;
+ }
+
+ if (async_specialize_shade) {
+ if (!async_compile_and_load) {
+ async_compile_and_load = true;
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
+ specialize_shade_fn);
+ }
+ else {
+ metal_printf(
+ "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
+ }
+ }
+ else {
+ specialize_shade_fn();
+ }
+}
+
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
- if (strcmp(name, "__data") == 0) {
+ if (strcmp(name, "data") == 0) {
assert(size == sizeof(KernelData));
- memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size);
+ memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
return;
}
@@ -646,19 +745,19 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
};
/* Update data storage pointers in launch parameters. */
- if (strcmp(name, "__integrator_state") == 0) {
+ if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
- const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
- offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size);
+ offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
-# define KERNEL_TEX(data_type, tex_name) \
+# define KERNEL_DATA_ARRAY(data_type, tex_name) \
else if (strcmp(name, #tex_name) == 0) \
{ \
update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
}
-# include "kernel/textures.h"
-# undef KERNEL_TEX
+# include "kernel/data_arrays.h"
+# undef KERNEL_DATA_ARRAY
}
void MetalDevice::global_alloc(device_memory &mem)
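(Reading aid, not part of the patch: the offsetof(IntegratorStateGPU, sort_partition_divisor) used as pointer_block_size above assumes IntegratorStateGPU begins with a contiguous run of device pointers terminated by the first non-pointer member. A sketch of that assumed layout; the pointer member names are hypothetical:)

  struct IntegratorStateGPUSketch {
    /* Leading block: contiguous device pointers, all of which are patched by
     * update_launch_pointers() when "integrator_state" is copied. */
    device_ptr path;        /* hypothetical name */
    device_ptr shadow_path; /* hypothetical name */
    /* ... more device_ptr members ... */

    /* First non-pointer member: offsetof() of this field therefore equals the
     * byte size of the pointer block alone. */
    int sort_partition_divisor;
  };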
@@ -800,9 +899,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
desc.textureType = MTLTextureType3D;
desc.depth = mem.data_depth;
- VLOG(2) << "Texture 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
assert(mtlTexture);
@@ -834,9 +933,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
desc.storageMode = storage_mode;
desc.usage = MTLTextureUsageShaderRead;
- VLOG(2) << "Texture 2D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
assert(mtlTexture);
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 69b2a686ecc..11393f8b7e1 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -31,7 +31,7 @@ enum {
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
-enum {
+enum MetalPipelineType {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
*/
PSO_GENERIC,
- /* A kernel that is relatively quick to compile, but is specialized for the
- * scene being rendered. It only contains the functionality and even baked in
- * constants for values that means it needs to be recompiled whenever a
- * dependent setting is changed. The render performance of this kernel is
- * significantly faster though, and justifies the extra compile time.
+ /* An intersection kernel that is very quick to specialize and results in faster intersection
+ * kernel performance. It uses Metal function constants to replace several KernelData variables
+ * with fixed constants.
+ */
+ PSO_SPECIALIZED_INTERSECT,
+
+ /* A shading kernel that is slow to specialize, but results in faster shading kernel
+ * performance. It uses Metal function constants to replace several KernelData variables with
+ * fixed constants and short-circuits all unused SVM node case handlers.
*/
- /* METAL_WIP: This isn't used and will require more changes to enable. */
- PSO_SPECIALISED,
+ PSO_SPECIALIZED_SHADE,
PSO_NUM
};
-const char *kernel_type_as_string(int kernel_type);
+const char *kernel_type_as_string(MetalPipelineType pso_type);
struct MetalKernelPipeline {
void compile();
id<MTLLibrary> mtlLibrary = nil;
- bool scene_specialized;
+ MetalPipelineType pso_type;
string source_md5;
+ size_t usage_count = 0;
+ KernelData kernel_data_;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
+ bool should_use_binary_archive() const;
+
string error_str;
API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
-bool load(MetalDevice *device, bool scene_specialized);
+bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
+bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
} /* namespace MetalDeviceKernels */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index fec4cd80466..385cb412b06 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
+# include "kernel/device/metal/function_constants.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
{
- switch (kernel_type) {
+ switch (pso_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
- case PSO_SPECIALISED:
- return "PSO_SPECIALISED";
+ case PSO_SPECIALIZED_INTERSECT:
+ return "PSO_SPECIALIZED_INTERSECT";
+ case PSO_SPECIALIZED_SHADE:
+ return "PSO_SPECIALIZED_SHADE";
default:
assert(0);
}
@@ -50,7 +53,11 @@ struct ShaderCache {
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
* device. */
- void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+ void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+ bool should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type);
void wait_for_all();
@@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index)
}
}
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
- MetalDevice *device,
- bool scene_specialized)
+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
{
- {
- /* create compiler threads on first run */
- thread_scoped_lock lock(cache_mutex);
- if (compile_threads.empty()) {
- running = true;
- for (int i = 0; i < max_mtlcompiler_threads; i++) {
- compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
- }
- }
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ /* Skip megakernel. */
+ return false;
}
- if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- /* skip megakernel */
- return;
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+ /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+ return false;
+ }
}
- if (scene_specialized) {
+ if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- return;
+ return false;
+ }
+
+ /* Only specialize shading / intersection kernels as requested. */
+ bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+ if (is_shade_pso != is_shade_kernel) {
+ return false;
}
}
@@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
- if (scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- /* we already requested a pipeline that is specialized for this kernel data */
- metal_printf("Specialized kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ if (pipeline->source_md5 == device->source_md5[pso_type]) {
+ return false;
}
- else {
- if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
- /* we already requested a generic pipeline for this kernel */
- metal_printf("Generic kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ }
+ }
+
+ return true;
+}
+
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
+{
+ {
+    /* Create compiler threads on first run. */
+ thread_scoped_lock lock(cache_mutex);
+ if (compile_threads.empty()) {
+ running = true;
+ for (int i = 0; i < max_mtlcompiler_threads; i++) {
+ compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
}
}
+ if (!should_load_kernel(device_kernel, device, pso_type)) {
+ return;
+ }
+
incomplete_requests++;
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
- request.pipeline->scene_specialized = scene_specialized;
+ memcpy(&request.pipeline->kernel_data_,
+ &device->launch_params.data,
+ sizeof(request.pipeline->kernel_data_));
+ request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
- request.pipeline->source_md5 =
- device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->mtlLibrary =
- device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+ request.pipeline->source_md5 = device->source_md5[pso_type];
+ request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
{
thread_scoped_lock lock(cache_mutex);
- pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+ auto &collection = pipelines[device_kernel];
+
+ /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+ int max_entries_of_same_pso_type = 3;
+ for (int i = (int)collection.size() - 1; i >= 0; i--) {
+ if (collection[i]->pso_type == pso_type) {
+ max_entries_of_same_pso_type -= 1;
+ if (max_entries_of_same_pso_type == 0) {
+ metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+ kernel_type_as_string(pso_type),
+ device_kernel_as_string(device_kernel));
+ collection.erase(collection.begin() + i);
+ break;
+ }
+ }
+ }
+
+ collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
}
cond_var.notify_one();
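The purge loop above walks the collection from newest to oldest and erases the third-newest entry of the same pso_type before appending, so at most three variants per type remain. A worked example with hypothetical specialized-shade entries S1..S4 for one kernel (oldest first):

  /* cached [S1, S2, S3], request S4  ->  erase S1, append S4  ->  [S2, S3, S4] */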
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
continue;
}
- if (pipeline->scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+ if (pipeline->pso_type != PSO_GENERIC) {
+ if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+ pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
best_pipeline = pipeline.get();
}
}
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
}
}
+ if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+ metal_printf("Swapping in %s version of %s\n",
+ kernel_type_as_string(best_pipeline->pso_type),
+ device_kernel_as_string(kernel));
+ }
+ best_pipeline->usage_count += 1;
+
return best_pipeline;
}
-void MetalKernelPipeline::compile()
+bool MetalKernelPipeline::should_use_binary_archive() const
{
- int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+ if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ if (atoi(str) != 0) {
+ /* Don't archive if we have opted out by env var. */
+ return false;
+ }
+ }
+
+ if (pso_type == PSO_GENERIC) {
+ /* Archive the generic kernels. */
+ return true;
+ }
+
+ if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+ device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Archive all shade kernels - they take a long time to compile. */
+ return true;
+ }
+
+ /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+ * but will be quick to regenerate if not. */
+ return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+ MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+ MTLDataType MTLDataType_int = MTLDataTypeInt;
+ MTLDataType MTLDataType_float = MTLDataTypeFloat;
+ MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+ KernelData zero_data = {0};
+ if (!data) {
+ data = &zero_data;
+ }
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ [constant_values setConstantValue:&data->parent.name \
+ type:MTLDataType_##_type \
+ atIndex:KernelData_##parent##_##name];
+
+# include "kernel/data_template.h"
+
+ return constant_values;
+}
+
+void MetalKernelPipeline::compile()
+{
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
+
+      if (pso_type == PSO_SPECIALIZED_SHADE || pso_type == PSO_SPECIALIZED_INTERSECT) {
+        /* Specialized variants bake the current KernelData into function constants. */
+        func_desc.constantValues = GetConstantValues(&kernel_data_);
+      }
+      else {
+        /* The generic variant uses zeroed defaults for all function constants. */
+        func_desc.constantValues = GetConstantValues();
+      }
+
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
}
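The KERNEL_STRUCT_MEMBER X-macro in GetConstantValues() expands once for every KernelData member listed in kernel/data_template.h. For a hypothetical member declared as KERNEL_STRUCT_MEMBER(film, float, exposure), the expansion would be roughly:

  [constant_values setConstantValue:&data->film.exposure
                               type:MTLDataTypeFloat
                            atIndex:KernelData_film_exposure];

so specialized pipelines bake the current scene's kernel data into Metal function constants, while the generic pipeline gets zeroed defaults.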
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
- bool use_binary_archive = true;
- if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
- use_binary_archive = (atoi(str) == 0);
- }
+ bool use_binary_archive = should_use_binary_archive();
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
@@ -608,19 +706,32 @@ void MetalKernelPipeline::compile()
}
}
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
+ const double starttime = time_dt();
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
- shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+ shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
- if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
- shader_cache->wait_for_all();
- }
+ shader_cache->wait_for_all();
+ metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
return true;
}
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
+{
+ auto shader_cache = get_shader_cache(device->mtlDevice);
+ for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+ if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..fc32740f3e1 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
+ virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 0e260886abb..5ac63a16c61 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
+int MetalDeviceQueue::num_sort_partition_elements() const
+{
+ return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
+}
+
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -311,8 +316,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
return false;
}
- VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
- << work_size;
+ VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
@@ -358,8 +363,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
- size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) +
- sizeof(IntegratorStateGPU);
+ size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -415,8 +420,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
- const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
- sizeof(IntegratorStateGPU);
+ const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
@@ -550,7 +555,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Enhanced command buffer errors are only available in 11.0+ */
if (@available(macos 11.0, *)) {
if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
- printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
valueForKey:MTLCommandBufferEncoderInfoErrorKey];
if (encoderInfos != nil) {
@@ -564,7 +569,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
}
else if (command_buffer.error) {
- printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
}
}
}];
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index f728967835d..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -25,10 +25,20 @@ enum MetalGPUVendor {
METAL_GPU_INTEL = 3,
};
+enum AppleGPUArchitecture {
+ APPLE_UNKNOWN,
+ APPLE_M1,
+ APPLE_M2,
+};
+
/* Contains static Metal helper functions. */
struct MetalInfo {
static vector<id<MTLDevice>> const &get_usable_devices();
- static MetalGPUVendor get_vendor_from_device_name(string const &device_name);
+ static int get_apple_gpu_core_count(id<MTLDevice> device);
+ static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
+ static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+ static int optimal_sort_partition_elements(id<MTLDevice> device);
+ static string get_device_name(id<MTLDevice> device);
};
/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a6bd593bcb6..65c67c400fe 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -10,26 +10,83 @@
# include "util/string.h"
# include "util/time.h"
+# include <IOKit/IOKitLib.h>
# include <pwd.h>
# include <sys/shm.h>
# include <time.h>
CCL_NAMESPACE_BEGIN
-MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+string MetalInfo::get_device_name(id<MTLDevice> device)
{
- if (device_name.find("Intel") != string::npos) {
+ string device_name = [device.name UTF8String];
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ /* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */
+ int gpu_core_count = get_apple_gpu_core_count(device);
+ device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count);
+ }
+ return device_name;
+}
+
+int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device)
+{
+ int core_count = 0;
+ if (@available(macos 12.0, *)) {
+ io_service_t gpu_service = IOServiceGetMatchingService(
+ kIOMainPortDefault, IORegistryEntryIDMatching(device.registryID));
+ if (CFNumberRef numberRef = (CFNumberRef)IORegistryEntryCreateCFProperty(
+ gpu_service, CFSTR("gpu-core-count"), 0, 0)) {
+ if (CFGetTypeID(numberRef) == CFNumberGetTypeID()) {
+ CFNumberGetValue(numberRef, kCFNumberSInt32Type, &core_count);
+ }
+ CFRelease(numberRef);
+ }
+ }
+ return core_count;
+}
+
+AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
+{
+ const char *device_name = [device.name UTF8String];
+ if (strstr(device_name, "M1")) {
+ return APPLE_M1;
+ }
+ else if (strstr(device_name, "M2")) {
+ return APPLE_M2;
+ }
+ return APPLE_UNKNOWN;
+}
+
+MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
+{
+ const char *device_name = [device.name UTF8String];
+ if (strstr(device_name, "Intel")) {
return METAL_GPU_INTEL;
}
- else if (device_name.find("AMD") != string::npos) {
+ else if (strstr(device_name, "AMD")) {
return METAL_GPU_AMD;
}
- else if (device_name.find("Apple") != string::npos) {
+ else if (strstr(device_name, "Apple")) {
return METAL_GPU_APPLE;
}
return METAL_GPU_UNKNOWN;
}
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+ if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+ return atoi(str);
+ }
+
+ /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+ * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+ * overall render time speedup of up to 15%. */
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ return 65536;
+ }
+ return 0;
+}
+
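To illustrate what partitioning the active indices buys (a fragment under assumed names, not code from this patch; assumes <algorithm> and the arrays shown): sorting by shader key within fixed 65536-element partitions keeps each partition's state indices close together in memory, trading some cross-partition shader coherence for cache locality.

  const int partition_size = 65536;
  for (int start = 0; start < num_active; start += partition_size) {
    const int end = std::min(start + partition_size, num_active);
    /* shader_of[] is a hypothetical per-path shader-key lookup. */
    std::sort(indices + start, indices + end,
              [&](int a, int b) { return shader_of[a] < shader_of[b]; });
  }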
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
{
static vector<id<MTLDevice>> usable_devices;
@@ -41,9 +98,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
- const char *device_name = [device.name UTF8String];
-
- MetalGPUVendor vendor = get_vendor_from_device_name(device_name);
+ string device_name = get_device_name(device);
+ MetalGPUVendor vendor = get_device_vendor(device);
bool usable = false;
if (@available(macos 12.2, *)) {
@@ -55,12 +111,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
if (usable) {
- metal_printf("- %s\n", device_name);
+ metal_printf("- %s\n", device_name.c_str());
[device retain];
usable_devices.push_back(device);
}
else {
- metal_printf(" (skipping \"%s\")\n", device_name);
+ metal_printf(" (skipping \"%s\")\n", device_name.c_str());
}
}
if (usable_devices.empty()) {
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
new file mode 100644
index 00000000000..8056c204188
--- /dev/null
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#include "device/oneapi/device.h"
+
+#include "util/log.h"
+
+#ifdef WITH_ONEAPI
+# include "device/device.h"
+# include "device/oneapi/device_impl.h"
+
+# include "util/path.h"
+# include "util/string.h"
+
+# ifdef __linux__
+# include <dlfcn.h>
+# endif
+#endif /* WITH_ONEAPI */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_ONEAPI
+static OneAPIDLLInterface oneapi_dll;
+#endif
+
+#ifdef _WIN32
+# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path))
+# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle)
+# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name)
+#elif __linux__
+# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW)
+# define FREE_SHARED_LIBRARY(handle) dlclose(handle)
+# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name)
+#endif
+
+bool device_oneapi_init()
+{
+#if !defined(WITH_ONEAPI)
+ return false;
+#else
+
+ string lib_path = path_get("lib");
+# ifdef _WIN32
+ lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll");
+# else
+ lib_path = path_join(lib_path, "cycles_kernel_oneapi.so");
+# endif
+ void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str());
+
+  /* This should not happen, but it is still worth handling gracefully. */
+ if (lib_handle == NULL) {
+ LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not "
+ "happen, however, it occurs hence oneAPI rendering will be disabled";
+ return false;
+ }
+
+# define DLL_INTERFACE_CALL(function, return_type, ...) \
+ (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \
+ GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \
+ if (oneapi_dll.function == NULL) { \
+ LOG(ERROR) << "oneAPI shared library function \"" << #function \
+ << "\" has not been loaded from kernel shared - disable oneAPI " \
+ "library disable oneAPI implementation due to this"; \
+ FREE_SHARED_LIBRARY(lib_handle); \
+ return false; \
+ }
+# include "kernel/device/oneapi/dll_interface_template.h"
+# undef DLL_INTERFACE_CALL
+
+ VLOG_INFO << "oneAPI kernel shared library has been loaded successfully";
+
+  /* The oneAPI kernel shared library is needed for the entire life-span of Blender,
+   * so it is intentionally never unloaded:
+   * FREE_SHARED_LIBRARY(lib_handle); */
+
+  /* NOTE(@nsirgien): We need to enable the JIT cache from here;
+   * right now this cache policy is controlled by env. variables. */
+  /* NOTE(hallade): We also disable use of the copy engine, as this
+   * improves stability as of intel/LLVM SYCL-nightly/20220529.
+   * All of these env variables can be set beforehand by end-users and
+   * will in that case not be overwritten. */
+# ifdef _WIN32
+ if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) {
+ _putenv_s("SYCL_CACHE_PERSISTENT", "1");
+ }
+ if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) {
+ _putenv_s("SYCL_CACHE_THRESHOLD", "0");
+ }
+ if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
+ _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+ }
+ if (getenv("SYCL_ENABLE_PCI") == nullptr) {
+ _putenv_s("SYCL_ENABLE_PCI", "1");
+ }
+ if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) {
+ _putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0");
+ }
+# elif __linux__
+ setenv("SYCL_CACHE_PERSISTENT", "1", false);
+ setenv("SYCL_CACHE_THRESHOLD", "0", false);
+ setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+ setenv("SYCL_ENABLE_PCI", "1", false);
+ setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
+# endif
+
+ return true;
+#endif
+}
+
+#if defined(_WIN32) || defined(__linux__)
+# undef LOAD_ONEAPI_SHARED_LIBRARY
+# undef FREE_SHARED_LIBRARY
+# undef GET_SHARED_LIBRARY_SYMBOL
+#endif
+
+Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_ONEAPI
+ return new OneapiDevice(info, oneapi_dll, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build.";
+
+ return nullptr;
+#endif
+}
+
+#ifdef WITH_ONEAPI
+static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
+{
+ vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
+
+ DeviceInfo info;
+
+ info.type = DEVICE_ONEAPI;
+ info.description = name;
+ info.num = num;
+
+  /* NOTE(@nsirgien): This should be unique, at least on a proper oneAPI installation. */
+ info.id = id;
+
+ info.has_nanovdb = true;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
+
+  /* NOTE(@nsirgien): oneAPI is currently focused on single-device usage. This may change in
+   * the future, but right now peer access from one device to another is not supported. */
+ info.has_peer_memory = false;
+
+  /* NOTE(@nsirgien): This does not seem to be queryable through SYCL/oneAPI or Level Zero. */
+ info.display_device = false;
+
+ devices->push_back(info);
+ VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+}
+#endif
+
+void device_oneapi_info(vector<DeviceInfo> &devices)
+{
+#ifdef WITH_ONEAPI
+ (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices);
+#else /* WITH_ONEAPI */
+ (void)devices;
+#endif /* WITH_ONEAPI */
+}
+
+string device_oneapi_capabilities()
+{
+ string capabilities;
+#ifdef WITH_ONEAPI
+ char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)();
+ if (c_capabilities) {
+ capabilities = c_capabilities;
+ (oneapi_dll.oneapi_free)(c_capabilities);
+ }
+#endif
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/oneapi/device.h b/intern/cycles/device/oneapi/device.h
new file mode 100644
index 00000000000..db8c985d4d5
--- /dev/null
+++ b/intern/cycles/device/oneapi/device.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "util/string.h"
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_oneapi_init();
+
+Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_oneapi_info(vector<DeviceInfo> &devices);
+
+string device_oneapi_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
new file mode 100644
index 00000000000..dd0622a5bd5
--- /dev/null
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+# include "device/oneapi/device_impl.h"
+
+# include "util/debug.h"
+# include "util/log.h"
+
+# include "kernel/device/oneapi/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+static void queue_error_cb(const char *message, void *user_ptr)
+{
+ if (user_ptr) {
+ *reinterpret_cast<std::string *>(user_ptr) = message;
+ }
+}
+
+OneapiDevice::OneapiDevice(const DeviceInfo &info,
+ OneAPIDLLInterface &oneapi_dll_object,
+ Stats &stats,
+ Profiler &profiler)
+ : Device(info, stats, profiler),
+ device_queue_(nullptr),
+ texture_info_(this, "texture_info", MEM_GLOBAL),
+ kg_memory_(nullptr),
+ kg_memory_device_(nullptr),
+ kg_memory_size_(0),
+ oneapi_dll_(oneapi_dll_object)
+{
+ need_texture_info_ = false;
+
+ oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
+
+  /* The oneAPI DLL interface must already be initialized at this point. */
+ assert(oneapi_dll_.oneapi_create_queue != nullptr);
+
+ bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
+ if (is_finished_ok == false) {
+ set_error("oneAPI queue initialization error: got runtime exception \"" +
+ oneapi_error_string_ + "\"");
+ }
+ else {
+ VLOG_DEBUG << "oneAPI queue has been successfully created for the device \""
+ << info.description << "\"";
+ assert(device_queue_);
+ }
+
+ size_t globals_segment_size;
+ is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
+ if (is_finished_ok == false) {
+ set_error("oneAPI constant memory initialization got runtime exception \"" +
+ oneapi_error_string_ + "\"");
+ }
+ else {
+ VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
+ }
+
+ kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
+ oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
+
+ kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
+
+ kg_memory_size_ = globals_segment_size;
+
+ max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
+}
+
+OneapiDevice::~OneapiDevice()
+{
+ texture_info_.free();
+ oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
+ oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
+
+ for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
+ delete mt->second;
+
+ if (device_queue_)
+ oneapi_dll_.oneapi_free_queue(device_queue_);
+}
+
+bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
+{
+ return false;
+}
+
+BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+bool OneapiDevice::load_kernels(const uint requested_features)
+{
+ assert(device_queue_);
+  /* NOTE(@nsirgien): oneAPI could compile kernels for a specific feature set using
+   * specialization constants, but this hasn't been implemented yet. */
+ (void)requested_features;
+
+ bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
+ if (is_finished_ok == false) {
+ set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
+ }
+ else {
+ VLOG_INFO << "Runtime compilation done for \"" << info.description << "\"";
+ assert(device_queue_);
+ }
+ return is_finished_ok;
+}
+
+void OneapiDevice::load_texture_info()
+{
+ if (need_texture_info_) {
+ need_texture_info_ = false;
+ texture_info_.copy_to_device();
+ }
+}
+
+void OneapiDevice::generic_alloc(device_memory &mem)
+{
+ size_t memory_size = mem.memory_size();
+
+  /* TODO(@nsirgien): In the future, if the scene doesn't fit into device memory, we could
+   * fall back to USM host memory.
+   * Because of the expected performance impact, implementing this has had a low priority
+   * and is not done yet. */
+
+ assert(device_queue_);
+  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host,
+   * device and shared. For a new project it may be more beneficial to use USM shared memory,
+   * because its automatic migration mechanism allows the same pointer to be used on both host
+   * and device without explicit memory transfer operations. For Blender/Cycles, however, this
+   * type of memory is not a good fit for the current application architecture: Cycles already
+   * uses separate pointers for host and device activity, and already performs all needed
+   * memory transfer operations itself. So USM device memory is used for the oneAPI device,
+   * as it fits the Cycles architecture better. */
+ void *device_pointer = nullptr;
+ if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
+ device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
+ if (device_pointer == nullptr) {
+ set_error("oneAPI kernel - device memory allocation error for " +
+ string_human_readable_size(mem.memory_size()) +
+ ", possibly caused by lack of available memory space on the device: " +
+ string_human_readable_size(stats.mem_used) + " of " +
+ string_human_readable_size(max_memory_on_device_) + " is already allocated");
+ }
+
+ mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
+ mem.device_size = memory_size;
+
+ stats.mem_alloc(memory_size);
+}
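For readers unfamiliar with the USM flavors discussed in the note above, a minimal plain-SYCL sketch (assumes <sycl/sycl.hpp> and a sycl::queue named queue; the patch itself goes through the oneapi_dll_ wrapper rather than calling SYCL directly):

  /* Device USM: pointer is valid on the device only; copies are explicit. */
  void *dptr = sycl::malloc_device(memory_size, queue);
  queue.memcpy(dptr, host_ptr, memory_size).wait();

  /* Shared USM: one pointer usable on host and device, migrated automatically. */
  void *sptr = sycl::malloc_shared(memory_size, queue);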
+
+void OneapiDevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ return;
+ }
+ size_t memory_size = mem.memory_size();
+
+ /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
+ assert(mem.host_pointer);
+ assert(device_queue_);
+ oneapi_dll_.oneapi_usm_memcpy(
+ device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+}
+
+/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
+SyclQueue *OneapiDevice::sycl_queue()
+{
+ return device_queue_;
+}
+
+string OneapiDevice::oneapi_error_message()
+{
+ return string(oneapi_error_string_);
+}
+
+OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
+{
+ return oneapi_dll_;
+}
+
+void *OneapiDevice::kernel_globals_device_pointer()
+{
+ return kg_memory_device_;
+}
+
+void OneapiDevice::generic_free(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+
+ assert(device_queue_);
+ oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
+ mem.device_pointer = 0;
+}
+
+void OneapiDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+ generic_alloc(mem);
+ }
+}
+
+void OneapiDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.name) {
+ VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer)
+ mem_alloc(mem);
+
+ generic_copy_to(mem);
+ }
+}
+
+void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size();
+ const size_t offset = elem * y * w;
+
+ if (mem.name) {
+ VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ") from offset " << offset
+ << " data " << size << " bytes";
+ }
+
+ assert(device_queue_);
+
+ assert(size != 0);
+ if (mem.device_pointer) {
+ char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
+ char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
+ bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
+ device_queue_, shifted_host, shifted_device, size);
+ if (is_finished_ok == false) {
+ set_error("oneAPI memory operation error: got runtime exception \"" +
+ oneapi_error_string_ + "\"");
+ }
+ }
+ }
+}
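A worked example of the offset arithmetic above, with hypothetical numbers: copying row 10 of a 1920-wide buffer with 16-byte elements via mem_copy_from(mem, 10, 1920, 1, 16) gives:

  /* offset = elem * y * w = 16 * 10 * 1920 = 307200 bytes into both buffers,
   * size   = elem * w * h = 16 * 1920 * 1  =  30720 bytes copied device -> host. */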
+
+void OneapiDevice::mem_zero(device_memory &mem)
+{
+ if (mem.name) {
+ VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")\n";
+ }
+
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ assert(device_queue_);
+ bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
+ device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
+ if (is_finished_ok == false) {
+ set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
+ "\"");
+ }
+}
+
+void OneapiDevice::mem_free(device_memory &mem)
+{
+ if (mem.name) {
+ VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", "
+ << string_human_readable_number(mem.device_size) << " bytes. ("
+ << string_human_readable_size(mem.device_size) << ")\n";
+ }
+
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
+{
+ return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) +
+ mem.memory_elements_size(offset));
+}
+
+void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ assert(name);
+
+ VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object "
+ << string_human_readable_number(size) << " bytes. ("
+ << string_human_readable_size(size) << ")";
+
+ ConstMemMap::iterator i = const_mem_map_.find(name);
+ device_vector<uchar> *data;
+
+ if (i == const_mem_map_.end()) {
+ data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
+ data->alloc(size);
+ const_mem_map_.insert(ConstMemMap::value_type(name, data));
+ }
+ else {
+ data = i->second;
+ }
+
+ assert(data->memory_size() <= size);
+ memcpy(data->data(), host, size);
+ data->copy_to_device();
+
+ oneapi_dll_.oneapi_set_global_memory(
+ device_queue_, kg_memory_, name, (void *)data->device_pointer);
+
+ oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+}
+
+void OneapiDevice::global_alloc(device_memory &mem)
+{
+ assert(mem.name);
+
+ size_t size = mem.memory_size();
+ VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object "
+ << string_human_readable_number(size) << " bytes. ("
+ << string_human_readable_size(size) << ")";
+
+ generic_alloc(mem);
+ generic_copy_to(mem);
+
+ oneapi_dll_.oneapi_set_global_memory(
+ device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
+
+ oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+}
+
+void OneapiDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void OneapiDevice::tex_alloc(device_texture &mem)
+{
+ generic_alloc(mem);
+ generic_copy_to(mem);
+
+  /* Resize if needed. In case of a resize, allocate extra slots in advance for future allocs. */
+ const uint slot = mem.slot;
+ if (slot >= texture_info_.size()) {
+ texture_info_.resize(slot + 128);
+ }
+
+ texture_info_[slot] = mem.info;
+ need_texture_info_ = true;
+
+ texture_info_[slot].data = (uint64_t)mem.device_pointer;
+}
+
+void OneapiDevice::tex_free(device_texture &mem)
+{
+ /* There is no texture memory in SYCL. */
+ if (mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
+{
+ return make_unique<OneapiDeviceQueue>(this);
+}
+
+int OneapiDevice::get_num_multiprocessors()
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
+}
+
+int OneapiDevice::get_max_num_threads_per_multiprocessor()
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
+}
+
+bool OneapiDevice::should_use_graphics_interop()
+{
+ /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
+ * return false. */
+ return false;
+}
+
+void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
+}
+
+void OneapiDevice::usm_free(void *usm_ptr)
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
new file mode 100644
index 00000000000..6abebf98684
--- /dev/null
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+# include "device/device.h"
+# include "device/oneapi/device.h"
+# include "device/oneapi/queue.h"
+
+# include "util/map.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class OneapiDevice : public Device {
+ private:
+ SyclQueue *device_queue_;
+
+ using ConstMemMap = map<string, device_vector<uchar> *>;
+ ConstMemMap const_mem_map_;
+ device_vector<TextureInfo> texture_info_;
+ bool need_texture_info_;
+ void *kg_memory_;
+ void *kg_memory_device_;
+ size_t kg_memory_size_ = (size_t)0;
+ size_t max_memory_on_device_ = (size_t)0;
+ OneAPIDLLInterface oneapi_dll_;
+ std::string oneapi_error_string_;
+
+ public:
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ OneapiDevice(const DeviceInfo &info,
+ OneAPIDLLInterface &oneapi_dll_object,
+ Stats &stats,
+ Profiler &profiler);
+
+ virtual ~OneapiDevice();
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool load_kernels(const uint requested_features) override;
+
+ void load_texture_info();
+
+ void generic_alloc(device_memory &mem);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ SyclQueue *sycl_queue();
+
+ string oneapi_error_message();
+
+ OneAPIDLLInterface oneapi_dll_object();
+
+ void *kernel_globals_device_pointer();
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+ void mem_copy_from(device_memory &mem)
+ {
+ mem_copy_from(mem, 0, 0, 0, 0);
+ }
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ /* Graphics resources interoperability. */
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+  /* NOTE(@nsirgien): These methods exist to avoid compilation problems on Windows with
+   * host-side compilation (MSVC). */
+ void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
+ void usm_free(void *usm_ptr);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h
new file mode 100644
index 00000000000..0a888194e98
--- /dev/null
+++ b/intern/cycles/device/oneapi/dll_interface.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+/* Include kernel header to get access to SYCL-specific types, like SyclQueue and
+ * OneAPIDeviceIteratorCallback. */
+#include "kernel/device/oneapi/kernel.h"
+
+#ifdef WITH_ONEAPI
+struct OneAPIDLLInterface {
+# define DLL_INTERFACE_CALL(function, return_type, ...) \
+ return_type (*function)(__VA_ARGS__) = nullptr;
+# include "kernel/device/oneapi/dll_interface_template.h"
+# undef DLL_INTERFACE_CALL
+};
+#endif
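dll_interface_template.h is included twice with different DLL_INTERFACE_CALL definitions: here it declares one function pointer per entry, while device_oneapi_init() in device.cpp above redefines the macro to resolve each symbol from the loaded library. For a hypothetical template entry DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *), the two expansions are roughly:

  /* In this struct: */
  bool (*oneapi_run_test_kernel)(SyclQueue *) = nullptr;

  /* In the loader: */
  oneapi_dll.oneapi_run_test_kernel = reinterpret_cast<decltype(
      oneapi_dll.oneapi_run_test_kernel)>(
      GET_SHARED_LIBRARY_SYMBOL(lib_handle, "oneapi_run_test_kernel"));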
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
new file mode 100644
index 00000000000..1e822e25f1a
--- /dev/null
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+# include "device/oneapi/queue.h"
+# include "device/oneapi/device_impl.h"
+# include "util/log.h"
+# include "util/time.h"
+# include <iomanip>
+# include <vector>
+
+# include "kernel/device/oneapi/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelExecutionInfo {
+ double elapsed_summary = 0.0;
+ int enqueue_count = 0;
+};
+
+/* OneapiDeviceQueue */
+
+OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device)
+ : DeviceQueue(device),
+ oneapi_device_(device),
+ oneapi_dll_(device->oneapi_dll_object()),
+ kernel_context_(nullptr)
+{
+}
+
+OneapiDeviceQueue::~OneapiDeviceQueue()
+{
+ delete kernel_context_;
+}
+
+int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+ const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+ oneapi_device_->get_max_num_threads_per_multiprocessor();
+ int num_states = max(8 * max_num_threads, 65536) * 16;
+
+ VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
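A worked example of the sizing above, for a hypothetical GPU with 512 multiprocessors and 128 threads each:

  /* max_num_threads = 512 * 128 = 65536
   * num_states      = max(8 * 65536, 65536) * 16 = 8388608 integrator states */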
+
+int OneapiDeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+ oneapi_device_->get_max_num_threads_per_multiprocessor();
+
+ return 4 * max(8 * max_num_threads, 65536);
+}
+
+void OneapiDeviceQueue::init_execution()
+{
+ oneapi_device_->load_texture_info();
+
+ SyclQueue *device_queue = oneapi_device_->sycl_queue();
+ void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
+ assert(device_queue);
+ assert(kg_dptr);
+ kernel_context_ = new KernelContext{device_queue, kg_dptr};
+
+ debug_init_execution();
+}
+
+bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
+ const int signed_kernel_work_size,
+ DeviceKernelArguments const &_args)
+{
+ if (oneapi_device_->have_error()) {
+ return false;
+ }
+
+ void **args = const_cast<void **>(_args.values);
+
+ debug_enqueue(kernel, signed_kernel_work_size);
+ assert(signed_kernel_work_size >= 0);
+ size_t kernel_work_size = (size_t)signed_kernel_work_size;
+
+ size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size(
+ kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
+ size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
+
+ assert(kernel_context_);
+
+ /* Call the oneAPI kernel DLL to launch the requested kernel. */
+ bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel(
+ kernel_context_, kernel, uniformed_kernel_work_size, args);
+
+ if (is_finished_ok == false) {
+ oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) +
+ "\" execution error: got runtime exception \"" +
+ oneapi_device_->oneapi_error_message() + "\"");
+ }
+
+ return is_finished_ok;
+}
+
+bool OneapiDeviceQueue::synchronize()
+{
+ if (oneapi_device_->have_error()) {
+ return false;
+ }
+
+ bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue());
+ if (is_finished_ok == false)
+ oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" +
+ oneapi_device_->oneapi_error_message() + "\"");
+
+ debug_synchronize();
+
+ return !(oneapi_device_->have_error());
+}
+
+void OneapiDeviceQueue::zero_to_device(device_memory &mem)
+{
+ oneapi_device_->mem_zero(mem);
+}
+
+void OneapiDeviceQueue::copy_to_device(device_memory &mem)
+{
+ oneapi_device_->mem_copy_to(mem);
+}
+
+void OneapiDeviceQueue::copy_from_device(device_memory &mem)
+{
+ oneapi_device_->mem_copy_from(mem);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
new file mode 100644
index 00000000000..716cbfdc88c
--- /dev/null
+++ b/intern/cycles/device/oneapi/queue.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#ifdef WITH_ONEAPI
+
+# include "device/kernel.h"
+# include "device/memory.h"
+# include "device/queue.h"
+
+# include "device/oneapi/device.h"
+# include "device/oneapi/dll_interface.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OneapiDevice;
+class device_memory;
+
+/* oneAPI implementation of the device queue. */
+class OneapiDeviceQueue : public DeviceQueue {
+ public:
+ explicit OneapiDeviceQueue(OneapiDevice *device);
+ ~OneapiDeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel,
+ const int kernel_work_size,
+ DeviceKernelArguments const &args) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ protected:
+ OneapiDevice *oneapi_device_;
+ OneAPIDLLInterface oneapi_dll_;
+ KernelContext *kernel_context_;
+ bool with_kernel_statistics_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
index 70810bae10d..68ca21374fd 100644
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -31,12 +31,12 @@ bool device_optix_init()
const OptixResult result = optixInit();
if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
+ VLOG_WARNING << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
return false;
}
else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ VLOG_WARNING << "OptiX initialization failed with error code " << (unsigned int)result;
return false;
}
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 9ab9bbb59c5..6c64e7106d5 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -26,7 +26,6 @@
# include "util/task.h"
# include "util/time.h"
-# undef __KERNEL_CPU__
# define __KERNEL_OPTIX__
# include "kernel/device/optix/globals.h"
@@ -40,6 +39,9 @@ CCL_NAMESPACE_BEGIN
// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
namespace {
+# if OPTIX_ABI_VERSION >= 60
+using ::optixUtilDenoiserInvokeTiled;
+# else
static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
const OptixImage2D &output,
unsigned int overlapWindowSizeInPixels,
@@ -216,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
}
return OPTIX_SUCCESS;
}
+# endif
# if OPTIX_ABI_VERSION >= 55
static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
@@ -246,7 +249,7 @@ OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: CUDADevice(info, stats, profiler),
sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
+ launch_params(this, "kernel_params", false),
denoiser_(this)
{
/* Make the CUDA context current. */
@@ -278,7 +281,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
};
# endif
if (DebugFlags().optix.use_debug) {
- VLOG(1) << "Using OptiX debug mode.";
+ VLOG_INFO << "Using OptiX debug mode.";
options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
}
optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
@@ -339,15 +342,29 @@ BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
return BVH_LAYOUT_OPTIX;
}
+static string get_optix_include_dir()
+{
+ const char *env_dir = getenv("OPTIX_ROOT_DIR");
+ const char *default_dir = CYCLES_RUNTIME_OPTIX_ROOT_DIR;
+
+ if (env_dir && env_dir[0]) {
+ const string env_include_dir = path_join(env_dir, "include");
+ return env_include_dir;
+ }
+ else if (default_dir[0]) {
+ const string default_include_dir = path_join(default_dir, "include");
+ return default_include_dir;
+ }
+
+ return string();
+}
+
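Resolution order illustrated with hypothetical paths: the environment variable wins, the build-time default is the fallback, and an empty string signals that runtime compilation is unavailable.

  /* OPTIX_ROOT_DIR=/opt/optix                       -> "/opt/optix/include"
   * unset, CYCLES_RUNTIME_OPTIX_ROOT_DIR=/usr/optix -> "/usr/optix/include"
   * both empty                                      -> "" (error reported later) */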
string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
/* Add OptiX SDK include directory to include paths. */
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
+ common_cflags += string_printf(" -I\"%s\"", get_optix_include_dir().c_str());
/* Specialization for shader raytracing. */
if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
@@ -421,7 +438,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_options.numPayloadValues = 8;
pipeline_options.numAttributeValues = 2; /* u, v */
pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+ pipeline_options.pipelineLaunchParamsVariableName = "kernel_params"; /* See globals.h */
pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
if (kernel_features & KERNEL_FEATURE_HAIR) {
@@ -457,10 +474,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
"lib/kernel_optix_shader_raytrace.ptx" :
"lib/kernel_optix.ptx");
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
+ std::string optix_include_dir = get_optix_include_dir();
+ if (optix_include_dir.empty()) {
set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
+ "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
+ "to a directory containing the OptiX SDK.");
+ return false;
+ }
+ else if (!path_is_directory(optix_include_dir)) {
+ set_error(string_printf(
+ "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
+ "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
+ "directory containing the OptiX SDK.",
+ optix_include_dir.c_str()));
return false;
}
ptx_filename = compile_kernel(
@@ -1390,13 +1416,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
options.operation = operation;
if (use_fast_trace_bvh ||
/* The build flags have to match the ones used to query the built-in curve intersection
- program (see optixBuiltinISModuleGet above) */
+ * program (see optixBuiltinISModuleGet above) */
build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) {
- VLOG(2) << "Using fast to trace OptiX BVH";
+ VLOG_INFO << "Using fast to trace OptiX BVH";
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
}
else {
- VLOG(2) << "Using fast to update OptiX BVH";
+ VLOG_INFO << "Using fast to update OptiX BVH";
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
}
@@ -2042,26 +2068,26 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
/* Set constant memory for CUDA module. */
CUDADevice::const_copy_to(name, host, size);
- if (strcmp(name, "__data") == 0) {
+ if (strcmp(name, "data") == 0) {
assert(size <= sizeof(KernelData));
/* Update traversable handle (since it is different for each device on multi devices). */
KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+ *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;
update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
return;
}
/* Update data storage pointers in launch parameters. */
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+# define KERNEL_DATA_ARRAY(data_type, data_name) \
+ if (strcmp(name, #data_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, data_name), host, size); \
return; \
}
- KERNEL_TEX(IntegratorStateGPU, __integrator_state)
-# include "kernel/textures.h"
-# undef KERNEL_TEX
+ KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
+# include "kernel/data_arrays.h"
+# undef KERNEL_DATA_ARRAY
}
void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 366bf95269d..f0d49ad6f6c 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -8,7 +8,6 @@
# include "util/time.h"
-# undef __KERNEL_CPU__
# define __KERNEL_OPTIX__
# include "kernel/device/optix/globals.h"
diff --git a/intern/cycles/device/queue.cpp b/intern/cycles/device/queue.cpp
index de65047ed6a..cc0cf0ccf84 100644
--- a/intern/cycles/device/queue.cpp
+++ b/intern/cycles/device/queue.cpp
@@ -19,7 +19,7 @@ DeviceQueue::DeviceQueue(Device *device)
DeviceQueue::~DeviceQueue()
{
- if (VLOG_IS_ON(3)) {
+ if (VLOG_DEVICE_STATS_IS_ON) {
/* Print kernel execution times sorted by time. */
vector<pair<DeviceKernelMask, double>> stats_sorted;
for (const auto &stat : stats_kernel_time_) {
@@ -32,17 +32,18 @@ DeviceQueue::~DeviceQueue()
return a.second > b.second;
});
- VLOG(3) << "GPU queue stats:";
+ VLOG_DEVICE_STATS << "GPU queue stats:";
for (const auto &[mask, time] : stats_sorted) {
- VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
- << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ VLOG_DEVICE_STATS << " " << std::setfill(' ') << std::setw(10) << std::fixed
+ << std::setprecision(5) << std::right << time
+ << "s: " << device_kernel_mask_as_string(mask);
}
}
}
void DeviceQueue::debug_init_execution()
{
- if (VLOG_IS_ON(3)) {
+ if (VLOG_DEVICE_STATS_IS_ON) {
last_sync_time_ = time_dt();
}
@@ -51,9 +52,9 @@ void DeviceQueue::debug_init_execution()
void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
{
- if (VLOG_IS_ON(3)) {
- VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
- << work_size;
+ if (VLOG_DEVICE_STATS_IS_ON) {
+ VLOG_DEVICE_STATS << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
}
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
@@ -61,10 +62,10 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
void DeviceQueue::debug_synchronize()
{
- if (VLOG_IS_ON(3)) {
+ if (VLOG_DEVICE_STATS_IS_ON) {
const double new_time = time_dt();
const double elapsed_time = new_time - last_sync_time_;
- VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+ VLOG_DEVICE_STATS << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 14a5db3a204..808431af401 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,13 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states() const = 0;
+  /* Number of elements in a partition of sorted shaders. Partitioning improves memory locality
+   * of integrator state fetches at the cost of decreased coherence for shader kernel execution. */
+ virtual int num_sort_partition_elements() const
+ {
+ return 65536;
+ }
+
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.