Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorXavier Hallade <xavier.hallade@intel.com>2022-07-27 10:38:19 +0300
committerXavier Hallade <xavier.hallade@intel.com>2022-07-27 10:45:33 +0300
commitd706d0460c5721e2b07f18ab6354754267628130 (patch)
treedb042aabbfb3a74a3d6f20e93ebffc854a854fa1 /intern
parent38e270ae30d97a171e72af0359d34d19a647489d (diff)
Cycles oneAPI: simplify num_concurrent_states selection
The number of Execution Units and resident "threads" (simd width * threads per EUs) are now exposed and used to select the number of states using a simplified heuristic.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/device/oneapi/device_impl.cpp12
-rw-r--r--intern/cycles/device/oneapi/device_impl.h3
-rw-r--r--intern/cycles/device/oneapi/queue.cpp43
-rw-r--r--intern/cycles/kernel/device/oneapi/dll_interface_template.h3
-rw-r--r--intern/cycles/kernel/device/oneapi/kernel.cpp23
5 files changed, 43 insertions, 41 deletions
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 0c0afd1d2df..bdcc15bba56 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -402,6 +402,18 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
return make_unique<OneapiDeviceQueue>(this);
}
+int OneapiDevice::get_num_multiprocessors()
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
+}
+
+int OneapiDevice::get_max_num_threads_per_multiprocessor()
+{
+ assert(device_queue_);
+ return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
+}
+
bool OneapiDevice::should_use_graphics_interop()
{
/* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index f925687ebe9..a0a747a3cf2 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -89,6 +89,9 @@ class OneapiDevice : public Device {
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
/* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
* side compilation (MSVC). */
void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 42e2408ee7a..1e822e25f1a 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -36,34 +36,9 @@ OneapiDeviceQueue::~OneapiDeviceQueue()
int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- int num_states;
-
- /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
- const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
- oneapi_device_->sycl_queue());
- if (compute_units >= 128) {
- /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */
- int base = 1024 * 1024;
- /* linear dependency (with coefficient less that 1) from amount of compute units. */
- num_states = (base * (compute_units / 128)) * 3 / 4;
-
- /* Limit amount of integrator states by one quarter of device memory, because
- * other allocations will need some space as well
- * TODO: base this calculation on the how many states what the GPU is actually capable of
- * running, with some headroom to improve occupancy. If the texture don't fit, offload into
- * unified memory. */
- size_t states_memory_size = num_states * state_size;
- size_t device_memory_amount =
- (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
- if (states_memory_size >= device_memory_amount / 4) {
- num_states = device_memory_amount / 4 / state_size;
- }
- }
- else {
- /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
- * memory. */
- num_states = 1024 * 512;
- }
+ const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+ oneapi_device_->get_max_num_threads_per_multiprocessor();
+ int num_states = max(8 * max_num_threads, 65536) * 16;
VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
<< string_human_readable_size(num_states * state_size);
@@ -73,14 +48,10 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
int OneapiDeviceQueue::num_concurrent_busy_states() const
{
- const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
- oneapi_device_->sycl_queue());
- if (compute_units >= 128) {
- return 1024 * 1024;
- }
- else {
- return 1024 * 512;
- }
+ const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+ oneapi_device_->get_max_num_threads_per_multiprocessor();
+
+ return 4 * max(8 * max_num_threads, 65536);
}
void OneapiDeviceQueue::init_execution()
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
index 662068c0fed..5dd0d4203a4 100644
--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
@@ -6,7 +6,8 @@ DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
DLL_INTERFACE_CALL(oneapi_free, void, void *)
DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 300e201600c..7e90c553c44 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -904,11 +904,26 @@ size_t oneapi_get_memcapacity(SyclQueue *queue)
.get_info<sycl::info::device::global_mem_size>();
}
-size_t oneapi_get_compute_units_amount(SyclQueue *queue)
+int oneapi_get_num_multiprocessors(SyclQueue *queue)
{
- return reinterpret_cast<sycl::queue *>(queue)
- ->get_device()
- .get_info<sycl::info::device::max_compute_units>();
+ const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+ if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+ return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ }
+ else
+ return 0;
+}
+
+int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
+{
+ const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+ if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
+ device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+ return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+ device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ }
+ else
+ return 0;
}
#endif /* WITH_ONEAPI */