diff options
author | Xavier Hallade <xavier.hallade@intel.com> | 2022-07-27 10:38:19 +0300 |
---|---|---|
committer | Xavier Hallade <xavier.hallade@intel.com> | 2022-07-27 10:45:33 +0300 |
commit | d706d0460c5721e2b07f18ab6354754267628130 (patch) | |
tree | db042aabbfb3a74a3d6f20e93ebffc854a854fa1 /intern | |
parent | 38e270ae30d97a171e72af0359d34d19a647489d (diff) |
Cycles oneAPI: simplify num_concurrent_states selection
The number of Execution Units and resident "threads" (simd width * threads
per EUs) are now exposed and used to select the number of states using
a simplified heuristic.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.cpp | 12 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.h | 3 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.cpp | 43 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/dll_interface_template.h | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.cpp | 23 |
5 files changed, 43 insertions, 41 deletions
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index 0c0afd1d2df..bdcc15bba56 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -402,6 +402,18 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create() return make_unique<OneapiDeviceQueue>(this); } +int OneapiDevice::get_num_multiprocessors() +{ + assert(device_queue_); + return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_); +} + +int OneapiDevice::get_max_num_threads_per_multiprocessor() +{ + assert(device_queue_); + return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_); +} + bool OneapiDevice::should_use_graphics_interop() { /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index f925687ebe9..a0a747a3cf2 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -89,6 +89,9 @@ class OneapiDevice : public Device { virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host * side compilation (MSVC). */ void *usm_aligned_alloc_host(size_t memory_size, size_t alignment); diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp index 42e2408ee7a..1e822e25f1a 100644 --- a/intern/cycles/device/oneapi/queue.cpp +++ b/intern/cycles/device/oneapi/queue.cpp @@ -36,34 +36,9 @@ OneapiDeviceQueue::~OneapiDeviceQueue() int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const { - int num_states; - - /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */ - const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( - oneapi_device_->sycl_queue()); - if (compute_units >= 128) { - /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */ - int base = 1024 * 1024; - /* linear dependency (with coefficient less that 1) from amount of compute units. */ - num_states = (base * (compute_units / 128)) * 3 / 4; - - /* Limit amount of integrator states by one quarter of device memory, because - * other allocations will need some space as well - * TODO: base this calculation on the how many states what the GPU is actually capable of - * running, with some headroom to improve occupancy. If the texture don't fit, offload into - * unified memory. */ - size_t states_memory_size = num_states * state_size; - size_t device_memory_amount = - (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue()); - if (states_memory_size >= device_memory_amount / 4) { - num_states = device_memory_amount / 4 / state_size; - } - } - else { - /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU - * memory. */ - num_states = 1024 * 512; - } + const int max_num_threads = oneapi_device_->get_num_multiprocessors() * + oneapi_device_->get_max_num_threads_per_multiprocessor(); + int num_states = max(8 * max_num_threads, 65536) * 16; VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " << string_human_readable_size(num_states * state_size); @@ -73,14 +48,10 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const int OneapiDeviceQueue::num_concurrent_busy_states() const { - const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( - oneapi_device_->sycl_queue()); - if (compute_units >= 128) { - return 1024 * 1024; - } - else { - return 1024 * 512; - } + const int max_num_threads = oneapi_device_->get_num_multiprocessors() * + oneapi_device_->get_max_num_threads_per_multiprocessor(); + + return 4 * max(8 * max_num_threads, 65536); } void OneapiDeviceQueue::init_execution() diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h index 662068c0fed..5dd0d4203a4 100644 --- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h +++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h @@ -6,7 +6,8 @@ DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) DLL_INTERFACE_CALL(oneapi_free, void, void *) DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue) DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 300e201600c..7e90c553c44 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -904,11 +904,26 @@ size_t oneapi_get_memcapacity(SyclQueue *queue) .get_info<sycl::info::device::global_mem_size>(); } -size_t oneapi_get_compute_units_amount(SyclQueue *queue) +int oneapi_get_num_multiprocessors(SyclQueue *queue) { - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::max_compute_units>(); + const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device(); + if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { + return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + } + else + return 0; +} + +int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue) +{ + const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device(); + if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && + device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { + return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() * + device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + } + else + return 0; } #endif /* WITH_ONEAPI */ |