Cycles oneAPI: simplify num_concurrent_states selection

The number of Execution Units and the number of resident "threads" (SIMD width * threads per EU) are now exposed and used to select the number of states with a simplified heuristic.
author: Xavier Hallade <xavier.hallade@intel.com> 2022-07-27 10:38:19 +0300
committer: Xavier Hallade <xavier.hallade@intel.com> 2022-07-27 10:45:33 +0300
commit: d706d0460c5721e2b07f18ab6354754267628130 (patch)
tree: db042aabbfb3a74a3d6f20e93ebffc854a854fa1 /intern/cycles/device/oneapi/queue.cpp
parent: 38e270ae30d97a171e72af0359d34d19a647489d (diff)
1 files changed, 7 insertions, 36 deletions
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 42e2408ee7a..1e822e25f1a 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -36,34 +36,9 @@ OneapiDeviceQueue::~OneapiDeviceQueue()
 
 int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  int num_states;
-
-  /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
-  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
-      oneapi_device_->sycl_queue());
-  if (compute_units >= 128) {
-    /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */
-    int base = 1024 * 1024;
-    /* linear dependency (with coefficient less that 1) from amount of compute units. */
-    num_states = (base * (compute_units / 128)) * 3 / 4;
-
-    /* Limit amount of integrator states by one quarter of device memory, because
-     * other allocations will need some space as well
-     * TODO: base this calculation on the how many states what the GPU is actually capable of
-     * running, with some headroom to improve occupancy. If the texture don't fit, offload into
-     * unified memory. */
-    size_t states_memory_size = num_states * state_size;
-    size_t device_memory_amount =
-        (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
-    if (states_memory_size >= device_memory_amount / 4) {
-      num_states = device_memory_amount / 4 / state_size;
-    }
-  }
-  else {
-    /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
-     * memory. */
-    num_states = 1024 * 512;
-  }
+  const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+                              oneapi_device_->get_max_num_threads_per_multiprocessor();
+  int num_states = max(8 * max_num_threads, 65536) * 16;
 
   VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
                     << string_human_readable_size(num_states * state_size);
@@ -73,14 +48,10 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
 
 int OneapiDeviceQueue::num_concurrent_busy_states() const
 {
-  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
-      oneapi_device_->sycl_queue());
-  if (compute_units >= 128) {
-    return 1024 * 1024;
-  }
-  else {
-    return 1024 * 512;
-  }
+  const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+                              oneapi_device_->get_max_num_threads_per_multiprocessor();
+
+  return 4 * max(8 * max_num_threads, 65536);
 }
 
 void OneapiDeviceQueue::init_execution()
author	Xavier Hallade <xavier.hallade@intel.com>	2022-07-27 10:38:19 +0300
committer	Xavier Hallade <xavier.hallade@intel.com>	2022-07-27 10:45:33 +0300
commit	d706d0460c5721e2b07f18ab6354754267628130 (patch)
tree	db042aabbfb3a74a3d6f20e93ebffc854a854fa1 /intern/cycles/device/oneapi/queue.cpp
parent	38e270ae30d97a171e72af0359d34d19a647489d (diff)