author    | Michael Jones <michael_p_jones@apple.com> | 2022-10-24 12:23:56 +0300
committer | Michael Jones <michael_p_jones@apple.com> | 2022-10-24 19:14:33 +0300
commit    | 8dd7b5b26b394207b5941d49750f7e3abadaf82a
tree      | fe7425665e435667fc8977cce3b32919dffd9f67 /intern/cycles/integrator
parent    | 4c6e07230f76c21e67f0d5ef15464ca7d55c404a
Cycles: Metal integrator state size tuning
This patch tunes the integrator state sizing for Metal (`num_concurrent_states` and `num_concurrent_busy_states`).
On all GPU architectures, we adjust the busy:total state ratio to 1:4, which gives better rendering performance than the previous 1:16 ratio (independent of the total state count). This yields a small performance uplift (e.g. 2-3% on M1 Ultra).
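For illustration, the ratio change can be read as follows (a minimal sketch; the function names and `num_states` parameter are assumptions, not the Metal backend's actual code):

```cpp
#include <cstddef>

// Hypothetical sketch: deriving the busy-state cap from the total state count.
static size_t busy_states_previous(size_t num_states) {
  return num_states / 16;  // old 1:16 busy:total ratio
}

static size_t busy_states_tuned(size_t num_states) {
  return num_states / 4;  // new 1:4 ratio, applied regardless of total count
}
```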
Additionally, for M2 architectures we double the overall state count if there is available memory headroom. Inclusive of the first change, we can expect an uplift of close to 10% in future, as this results in larger dispatch sizes and minimises work-submission overheads.

In order to make an accurate determination of available headroom, we defer the calculation of `num_concurrent_states` and `num_concurrent_busy_states` until the time of integrator state allocation (i.e. after all of the scene data has been allocated). We also refactor `alloc_integrator_soa` to calculate an *exact* single-state size in a first pass, right before allocating the integrator SoA buffers in a second pass.
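The deferred sizing can be pictured as below; this is a minimal sketch with an invented `QueueSketch` stand-in (the real interface is Cycles' `DeviceQueue`, and the placeholder bodies and headroom figure are assumptions):

```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical stand-in for the device queue; bodies are invented placeholders.
struct QueueSketch {
  size_t free_memory = 512 << 20;  // pretend 512 MiB of headroom remains

  size_t num_concurrent_states(size_t state_size) const {
    return free_memory / state_size;  // as many states as headroom allows
  }
  size_t num_concurrent_busy_states(size_t state_size) const {
    return num_concurrent_states(state_size) / 4;  // the tuned 1:4 ratio
  }
};

// Deferred sizing: called at integrator-state allocation time, i.e. after all
// scene data is resident, so the headroom figure above is meaningful.
void alloc_integrator_state(QueueSketch &queue, size_t single_state_size) {
  size_t max_num_paths = queue.num_concurrent_states(single_state_size);
  size_t min_num_active = queue.num_concurrent_busy_states(single_state_size);

  // Path compaction relies on regeneration not happening before half of the
  // states are free again, so cap the active count at half the pool.
  min_num_active = std::min(min_num_active, max_num_paths / 2);
  // ...a second pass would allocate the SoA buffers here...
  (void)min_num_active;
}
```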
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D16313
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 31
1 file changed, 21 insertions, 10 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index ee250a6916b..48f6cf3c903 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -18,13 +18,15 @@ CCL_NAMESPACE_BEGIN
 
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(const uint kernel_features)
 {
   size_t state_size = 0;
 
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
 #define KERNEL_STRUCT_END(name) \
   break; \
   }
@@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
-      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
-      min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
+      max_num_paths_(0),
+      min_num_active_main_paths_(0),
       max_active_main_path_index_(0)
 {
   memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
-
-  /* Limit number of active paths to the half of the overall state. This is due to the logic in the
-   * path compaction which relies on the fact that regeneration does not happen sooner than half of
-   * the states are available again. */
-  min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
 }
 
 void PathTraceWorkGPU::alloc_integrator_soa()
@@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa()
   integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                                 requested_volume_stack_size);
 
+  /* Determine the number of path states. Deferring this for as long as possible allows the backend
+   * to make better decisions about memory availability. */
+  if (max_num_paths_ == 0) {
+    size_t single_state_size = estimate_single_state_size(kernel_features);
+
+    max_num_paths_ = queue_->num_concurrent_states(single_state_size);
+    min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
+
+    /* Limit number of active paths to the half of the overall state. This is due to the logic in
+     * the path compaction which relies on the fact that regeneration does not happen sooner than
+     * half of the states are available again. */
+    min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
+  }
+
   /* Allocate a device only memory buffer before for each struct member, and then
    * write the pointers into a struct that resides in constant memory.
   *
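For readers unfamiliar with the KERNEL_STRUCT_* X-macro pattern the first hunk modifies, here is a self-contained toy version of the feature-gated size estimation; the member types and feature flags are invented for illustration, not Cycles' real ones:

```cpp
#include <cstddef>
#include <cstdio>

// Toy feature flags (invented for illustration; not Cycles' actual flags).
enum : unsigned { FEATURE_BASE = 1u << 0, FEATURE_VOLUME = 1u << 1 };

// Mirror of the patched estimator: each member contributes its size only
// when its feature bit is enabled, yielding an exact per-state size.
static size_t estimate_state_size(unsigned kernel_features) {
  size_t state_size = 0;
#define STRUCT_MEMBER(type, feature) \
  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
  STRUCT_MEMBER(float, FEATURE_BASE)    /* e.g. a throughput component */
  STRUCT_MEMBER(int, FEATURE_BASE)      /* e.g. a path flag */
  STRUCT_MEMBER(float, FEATURE_VOLUME)  /* only counted when volumes are on */
#undef STRUCT_MEMBER
  return state_size;
}

int main() {
  std::printf("base: %zu bytes\n", estimate_state_size(FEATURE_BASE));
  std::printf("base+volume: %zu bytes\n",
              estimate_state_size(FEATURE_BASE | FEATURE_VOLUME));
}
```

Counting only enabled members is what lets the deferred sizing above divide real memory headroom by a tight per-path figure instead of a worst-case one.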