git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/CMakeLists.txt          |  3
-rw-r--r--  intern/cycles/device/cuda/queue.cpp          |  2
-rw-r--r--  intern/cycles/device/cuda/queue.h            |  2
-rw-r--r--  intern/cycles/device/hip/queue.cpp           |  2
-rw-r--r--  intern/cycles/device/hip/queue.h             |  2
-rw-r--r--  intern/cycles/device/metal/device_impl.mm    |  2
-rw-r--r--  intern/cycles/device/metal/kernel.mm         | 68
-rw-r--r--  intern/cycles/device/metal/queue.h           |  2
-rw-r--r--  intern/cycles/device/metal/queue.mm          | 51
-rw-r--r--  intern/cycles/device/oneapi/device_impl.cpp  | 24
-rw-r--r--  intern/cycles/device/oneapi/queue.cpp        |  2
-rw-r--r--  intern/cycles/device/oneapi/queue.h          |  2
-rw-r--r--  intern/cycles/device/queue.h                 |  2
13 files changed, 111 insertions(+), 53 deletions(-)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 5296d819e42..bfca3ab6aea 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -224,7 +224,8 @@ include_directories(SYSTEM ${INC_SYS})
cycles_add_library(cycles_device "${LIB}" ${SRC})
if(WITH_CYCLES_DEVICE_ONEAPI)
- # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour
+ # Need to have proper rebuilding in case of changes
+ # in cycles_kernel_oneapi due to external project behavior.
add_dependencies(cycles_device cycles_kernel_oneapi)
endif()
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = cuda_device_->get_num_multiprocessors() *
cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
~CUDADeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = hip_device_->get_num_multiprocessors() *
hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
~HIPDeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 4b929b6bc0a..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
}
source = global_defines + source;
+# if 0
metal_printf("================\n%s================\n\%s================\n",
global_defines.c_str(),
baked_constants.c_str());
+# endif
/* Generate an MD5 from the source and include any baked constants. This is used when caching
* PSOs. */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 8ccc50e57a3..dc8af9a5358 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
struct ShaderCache {
ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
{
+ /* Initialize occupancy tuning LUT. */
+ if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+ switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+ default:
+ case APPLE_M2:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+ break;
+ case APPLE_M1:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+ break;
+ }
+ }
}
~ShaderCache();
@@ -73,6 +103,11 @@ struct ShaderCache {
std::function<void(MetalKernelPipeline *)> completionHandler;
};
+ struct OccupancyTuningParameters {
+ int threads_per_threadgroup = 0;
+ int num_threads_per_block = 0;
+ } occupancy_tuning[DEVICE_KERNEL_NUM];
+
std::mutex cache_mutex;
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -162,6 +197,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
}
}
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+ if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+ /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+ return false;
+ }
+ }
+
if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
@@ -223,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
+ if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+ request.pipeline->threads_per_threadgroup =
+ occupancy_tuning[device_kernel].threads_per_threadgroup;
+ request.pipeline->num_threads_per_block =
+ occupancy_tuning[device_kernel].num_threads_per_block;
+ }
+
/* metalrt options */
request.pipeline->use_metalrt = device->use_metalrt;
request.pipeline->metalrt_hair = device->use_metalrt &&
@@ -367,13 +416,6 @@ void MetalKernelPipeline::compile()
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
- int threads_per_threadgroup = this->threads_per_threadgroup;
- if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
- device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
- /* Always use 512 for the sorting kernels */
- threads_per_threadgroup = 512;
- }
-
NSString *entryPoint = [@(function_name.c_str()) copy];
NSError *error = NULL;
@@ -637,12 +679,14 @@ void MetalKernelPipeline::compile()
return;
}
- int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
- computePipelineState.threadExecutionWidth);
- num_threads_per_block = std::max(num_threads_per_block,
- (int)computePipelineState.threadExecutionWidth);
+ if (!num_threads_per_block) {
+ num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+ computePipelineState.threadExecutionWidth);
+ num_threads_per_block = std::max(num_threads_per_block,
+ (int)computePipelineState.threadExecutionWidth);
+ }
+
this->pipeline = computePipelineState;
- this->num_threads_per_block = num_threads_per_block;
if (@available(macOS 11.0, *)) {
if (creating_new_archive || recreate_archive) {
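Taken together, these kernel.mm hunks replace the hard-coded 512-thread special case for the sorting kernels with a per-architecture lookup table: each tuned kernel gets an explicit {threads_per_threadgroup, num_threads_per_block} pair, and a zero entry means "untuned". A condensed sketch of the resulting fallback logic, using the names from the diff (illustrative, not the verbatim Blender code):

#include <algorithm>

struct OccupancyTuningParameters {
  int threads_per_threadgroup = 0; /* 0 means no tuned value. */
  int num_threads_per_block = 0;
};

/* Pick the block size for a compiled pipeline: a hand-tuned value wins;
 * otherwise fall back to the largest execution-width multiple the pipeline
 * allows, as the final hunk above does. */
int resolve_num_threads_per_block(const OccupancyTuningParameters &tuned,
                                  const int max_total_threads_per_threadgroup,
                                  const int thread_execution_width)
{
  if (tuned.num_threads_per_block) {
    return tuned.num_threads_per_block;
  }
  const int rounded = max_total_threads_per_threadgroup -
                      (max_total_threads_per_threadgroup % thread_execution_width);
  return std::max(rounded, thread_execution_width);
}

Note that the switch's default label falls through to the APPLE_M2 case, so Apple GPU architectures without their own table inherit the M2 tuning until measured values exist.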
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
~MetalDeviceQueue();
virtual int num_concurrent_states(const size_t) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
}
}
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- int result = 1048576;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
+ static int result = 0;
+ if (result) {
+ return result;
}
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+ result = 1048576;
+ if (metal_device_->device_vendor == METAL_GPU_APPLE) {
result *= 4;
+
+ if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+ size_t system_ram = system_physical_ram();
+ size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+ size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+ /* Determine whether we can double the state count, and leave enough GPU-available memory
+ * (1/8 of system RAM or 1GB, whichever is larger). Enlarging the state pool allows us to
+ * keep dispatch sizes high and minimize work submission overheads. */
+ size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+ size_t total_state_size = result * state_size;
+ if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+ result *= 2;
+ metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+ }
+ }
+ }
+ else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+ /* METAL_WIP */
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ result *= 2;
}
return result;
}
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- int result = 65536;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
- }
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
- result *= 4;
- }
- return result;
+ /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+ return num_concurrent_states(state_size) / 4;
}
int MetalDeviceQueue::num_sort_partition_elements() const
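Two things change in queue.mm: the Apple M2 path now doubles the state count when enough GPU-available memory remains, and num_concurrent_busy_states becomes a fixed quarter of the total, replacing the separate per-vendor constants. A sketch of the doubling criterion with the quantities passed in explicitly (in the real code they come from Metal device queries; like the original expression, this uses unsigned arithmetic and assumes the working set is not already exceeded):

#include <algorithm>
#include <cstddef>

bool can_double_state_count(const size_t max_recommended_working_set,
                            const size_t allocated_so_far,
                            const size_t num_states,
                            const size_t state_size,
                            const size_t system_ram)
{
  /* Reserve 1/8 of system RAM, but never less than 1 GiB. */
  const size_t min_headroom = std::max(system_ram / 8, size_t(1) << 30);
  const size_t total_state_size = num_states * state_size;
  /* Doubling is allowed only if the doubled pool still leaves min_headroom
   * inside the GPU's recommended working set. */
  return max_recommended_working_set - allocated_so_far - total_state_size * 2 >=
         min_headroom;
}

Worked example with assumed numbers: 32 GiB of system RAM gives min_headroom = max(4 GiB, 1 GiB) = 4 GiB. With a 24 GiB recommended working set, 2 GiB already allocated, and 4,194,304 states of roughly 560 bytes each (about 2.2 GiB), a doubled pool of about 4.4 GiB still leaves roughly 17.6 GiB of headroom, so the count doubles. The static local in num_concurrent_states caches the answer, so this decision is made once per session.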
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 3588b75713b..d0ddd69289c 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -668,8 +668,9 @@ int OneapiDevice::parse_driver_build_version(const sycl::device &device)
std::vector<sycl::device> OneapiDevice::available_devices()
{
bool allow_all_devices = false;
- if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+ if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
allow_all_devices = true;
+ }
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
@@ -686,15 +687,16 @@ std::vector<sycl::device> OneapiDevice::available_devices()
platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
+ bool filter_out = false;
if (!allow_all_devices) {
- bool filter_out = false;
-
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
* assuming they have either more than 96 Execution Units or not 7 threads per EU.
* Official support can be broadened to older and smaller GPUs once ready. */
- if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
- /* Filtered-out defaults in-case these values aren't available through too old L0
- * runtime. */
+ if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+ filter_out = true;
+ }
+ else {
+ /* Defaults chosen to filter the device out in case these values aren't available. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
@@ -718,13 +720,9 @@ std::vector<sycl::device> OneapiDevice::available_devices()
}
}
}
- else if (!allow_all_devices) {
- filter_out = true;
- }
-
- if (!filter_out) {
- available_devices.push_back(device);
- }
+ }
+ if (!filter_out) {
+ available_devices.push_back(device);
}
}
}
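This refactor flattens the nested conditionals into a single filter_out flag per device: a device survives filtering if CYCLES_ONEAPI_ALL_DEVICES is set, or if it is a Level Zero GPU with either more than 96 Execution Units or a thread-per-EU count other than 7. A condensed sketch of that predicate (the boolean and integer parameters are illustrative; the real code queries SYCL aspects directly):

/* Sketch: the per-device filtering rule after this refactor. Defaults of
 * 96 EUs and 7 threads/EU apply when the Level Zero runtime cannot report
 * them, which deliberately filters such devices out. */
bool is_device_filtered_out(const bool allow_all_devices,
                            const bool is_level_zero_gpu,
                            const int number_of_eus,  /* default 96 */
                            const int threads_per_eu) /* default 7 */
{
  if (allow_all_devices) {
    return false; /* CYCLES_ONEAPI_ALL_DEVICES bypasses all filtering. */
  }
  if (!is_level_zero_gpu) {
    return true;
  }
  /* Supported: GPUs with more than 96 EUs, or any EU layout that is not the
   * 7-threads-per-EU configuration of older, smaller parts. */
  return !(number_of_eus > 96 || threads_per_eu != 7);
}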
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
/* Number of states which keeps the device occupied with work without losing performance.
* The renderer will add more work (when available) when number of active paths falls below this
* value. */
- virtual int num_concurrent_busy_states() const = 0;
+ virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
/* Number of elements in a partition of sorted shaders, that improves memory locality of
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
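With state_size in the signature, every backend can budget busy states by memory footprint as well as by thread count, which is what the Metal change above exploits. A hypothetical caller-side sketch of how a scheduler might consume this value, using the DeviceQueue interface declared in this header (the function and parameter names are illustrative, not Blender's actual scheduling code):

/* Hypothetical refill check: add work once the number of active paths falls
 * below the busy threshold, as the comment above describes. */
void maybe_enqueue_more_work(DeviceQueue *queue,
                             const size_t state_size,
                             const int num_active_paths,
                             const bool more_work_available)
{
  if (more_work_available &&
      num_active_paths < queue->num_concurrent_busy_states(state_size)) {
    /* Schedule the next batch of camera rays or resumed paths here. */
  }
}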