Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--intern/cycles/device/CMakeLists.txt22
-rw-r--r--intern/cycles/device/cuda/queue.cpp2
-rw-r--r--intern/cycles/device/cuda/queue.h2
-rw-r--r--intern/cycles/device/hip/queue.cpp2
-rw-r--r--intern/cycles/device/hip/queue.h2
-rw-r--r--intern/cycles/device/metal/device_impl.mm10
-rw-r--r--intern/cycles/device/metal/kernel.mm13
-rw-r--r--intern/cycles/device/metal/queue.h2
-rw-r--r--intern/cycles/device/metal/queue.mm51
-rw-r--r--intern/cycles/device/metal/util.mm6
-rw-r--r--intern/cycles/device/oneapi/device.cpp4
-rw-r--r--intern/cycles/device/oneapi/device_impl.cpp69
-rw-r--r--intern/cycles/device/oneapi/device_impl.h2
-rw-r--r--intern/cycles/device/oneapi/queue.cpp2
-rw-r--r--intern/cycles/device/oneapi/queue.h2
-rw-r--r--intern/cycles/device/queue.h2
16 files changed, 109 insertions, 84 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 5516e97f34f..5296d819e42 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
)
endif()
if (WITH_CYCLES_DEVICE_ONEAPI)
+ if(WITH_CYCLES_ONEAPI_BINARIES)
+ set(cycles_kernel_oneapi_lib_suffix "_aot")
+ else()
+ set(cycles_kernel_oneapi_lib_suffix "_jit")
+ endif()
if(WIN32)
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
else()
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
+ endif()
+ list(APPEND LIB ${cycles_kernel_oneapi_lib})
+ if(WIN32)
+ list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY})
+ else()
+ list(APPEND LIB ${SYCL_LIBRARY})
endif()
- list(APPEND LIB
- ${cycles_kernel_oneapi_lib}
- "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>"
- "$<$<CONFIG:Release>:${SYCL_LIBRARY}>"
- "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>"
- "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>"
- )
add_definitions(-DWITH_ONEAPI)
list(APPEND SRC
${SRC_ONEAPI}
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = cuda_device_->get_num_multiprocessors() *
cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
~CUDADeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = hip_device_->get_num_multiprocessors() *
hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
~HIPDeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 6a16d4bb3b4..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
}
source = global_defines + source;
+# if 0
metal_printf("================\n%s================\n\%s================\n",
global_defines.c_str(),
baked_constants.c_str());
+# endif
/* Generate an MD5 from the source and include any baked constants. This is used when caching
* PSOs. */
@@ -339,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
+# if defined(MAC_OS_VERSION_13_0)
+ if (@available(macos 13.0, *)) {
+ if (device_vendor == METAL_GPU_INTEL) {
+ [options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
+ }
+ }
+# endif
+
options.fastMathEnabled = YES;
if (@available(macOS 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 5e0cb6d18f4..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
}
}
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+ if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+ /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+ return false;
+ }
+ }
+
if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
@@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const
}
}
+ /* Workaround for Intel GPU having issue using Binary Archives */
+ MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
+ if (gpu_vendor == METAL_GPU_INTEL) {
+ return false;
+ }
+
if (pso_type == PSO_GENERIC) {
/* Archive the generic kernels. */
return true;
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
~MetalDeviceQueue();
virtual int num_concurrent_states(const size_t) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
}
}
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- int result = 1048576;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
+ static int result = 0;
+ if (result) {
+ return result;
}
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+ result = 1048576;
+ if (metal_device_->device_vendor == METAL_GPU_APPLE) {
result *= 4;
+
+ if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+ size_t system_ram = system_physical_ram();
+ size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+ size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+ /* Determine whether we can double the state count, and leave enough GPU-available memory
+ * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
+ * keep dispatch sizes high and minimize work submission overheads. */
+ size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+ size_t total_state_size = result * state_size;
+ if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+ result *= 2;
+ metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+ }
+ }
+ }
+ else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+ /* METAL_WIP */
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ result *= 2;
}
return result;
}
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- int result = 65536;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
- }
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
- result *= 4;
- }
- return result;
+ /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+ return num_concurrent_states(state_size) / 4;
}
int MetalDeviceQueue::num_sort_partition_elements() const
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 65c67c400fe..f47638fac15 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
usable |= (vendor == METAL_GPU_AMD);
}
+# if defined(MAC_OS_VERSION_13_0)
+ if (@available(macos 13.0, *)) {
+ usable |= (vendor == METAL_GPU_INTEL);
+ }
+# endif
+
if (usable) {
metal_printf("- %s\n", device_name.c_str());
[device retain];
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
_putenv_s("SYCL_CACHE_THRESHOLD", "0");
}
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
- _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+ _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
}
if (getenv("SYCL_ENABLE_PCI") == nullptr) {
_putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
# elif __linux__
setenv("SYCL_CACHE_PERSISTENT", "1", false);
setenv("SYCL_CACHE_THRESHOLD", "0", false);
- setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+ setenv("SYCL_DEVICE_FILTER", "level_zero", false);
setenv("SYCL_ENABLE_PCI", "1", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
# endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 91f53fd1eae..d0ddd69289c 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,9 +430,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
- ((device_type == sycl::info::device_type::host ||
- device_type == sycl::info::device_type::cpu || allow_host) &&
- usm_type == sycl::usm::alloc::host));
+ ((device_type == sycl::info::device_type::cpu || allow_host) &&
+ usm_type == sycl::usm::alloc::host ||
+ usm_type == sycl::usm::alloc::unknown));
# else
/* Silence warning about unused arguments. */
(void)queue_;
@@ -668,16 +668,9 @@ int OneapiDevice::parse_driver_build_version(const sycl::device &device)
std::vector<sycl::device> OneapiDevice::available_devices()
{
bool allow_all_devices = false;
- if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+ if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
allow_all_devices = true;
-
- /* Host device is useful only for debugging at the moment
- * so we hide this device with default build settings. */
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- bool allow_host = true;
-# else
- bool allow_host = false;
-# endif
+ }
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
@@ -690,33 +683,28 @@ std::vector<sycl::device> OneapiDevice::available_devices()
}
const std::vector<sycl::device> &oneapi_devices =
- (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
- platform.get_devices(sycl::info::device_type::gpu);
+ (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+ platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
- if (allow_all_devices) {
- /* still filter out host device if build doesn't support it. */
- if (allow_host || !device.is_host()) {
- available_devices.push_back(device);
- }
- }
- else {
- bool filter_out = false;
-
+ bool filter_out = false;
+ if (!allow_all_devices) {
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
* Official support can be broaden to older and smaller GPUs once ready. */
- if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
- /* Filtered-out defaults in-case these values aren't available through too old L0
- * runtime. */
+ if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+ filter_out = true;
+ }
+ else {
+ /* Filtered-out defaults in-case these values aren't available. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
- number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
- device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
@@ -732,16 +720,9 @@ std::vector<sycl::device> OneapiDevice::available_devices()
}
}
}
- else if (!allow_host && device.is_host()) {
- filter_out = true;
- }
- else if (!allow_all_devices) {
- filter_out = true;
- }
-
- if (!filter_out) {
- available_devices.push_back(device);
- }
+ }
+ if (!filter_out) {
+ available_devices.push_back(device);
}
}
}
@@ -797,9 +778,7 @@ char *OneapiDevice::device_capabilities()
GET_NUM_ATTR(native_vector_width_double)
GET_NUM_ATTR(native_vector_width_half)
- size_t max_clock_frequency =
- (size_t)(device.is_host() ? (size_t)0 :
- device.get_info<sycl::info::device::max_clock_frequency>());
+ size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
GET_NUM_ATTR(address_bits)
@@ -837,7 +816,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
std::string name = device.get_info<sycl::info::device::name>();
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
- id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+ id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
num++;
@@ -855,7 +834,7 @@ int OneapiDevice::get_num_multiprocessors()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
- return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
else
return 0;
@@ -866,8 +845,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor()
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
- return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
- device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
+ device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
else
return 0;
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 62034150eac..197cf03d60d 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -3,7 +3,7 @@
#ifdef WITH_ONEAPI
-# include <CL/sycl.hpp>
+# include <sycl/sycl.hpp>
# include "device/device.h"
# include "device/oneapi/device.h"
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
/* Number of states which keeps the device occupied with work without losing performance.
* The renderer will add more work (when available) when number of active paths falls below this
* value. */
- virtual int num_concurrent_busy_states() const = 0;
+ virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
/* Number of elements in a partition of sorted shaders, that improves memory locality of
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */