git.blender.org/blender.git

author     Michael Jones <michael_p_jones@apple.com>    2022-07-13 22:56:57 +0300
committer  Michael Jones <michael_p_jones@apple.com>    2022-07-14 16:26:18 +0300
commit     4b1d315017ef103f3034160d349b3c3c21a4cd6a
tree       779dd8c27d37e710d3014911e962027b56049084
parent     47d4ce498e3f5a11a0210b1efd57053f0b1c85bd

Cycles: Improve cache usage on Apple GPUs by chunking active indices
This patch partitions the active indices into chunks prior to sorting by
material, in order to trade off some material coherence for better locality.
On Apple Silicon GPUs (particularly higher-end M1-family GPUs), we observe
overall render time speedups of up to 15%.

The partitioning is implemented by repeating the range of `shader_sort_key`
for each partition, and encoding a "locator" key which distributes the
indices into sorted chunks.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D15331
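
As a rough standalone illustration of the key encoding described above (this is not code from the patch: the constants are made-up example values, while divide_up() and the key formula mirror the divisor setup in path_trace_work_gpu.cpp and the INTEGRATOR_SORT_KEY macro in state_flow.h below):

#include <cstdio>

/* Standalone sketch of the partitioned sort key. */
static unsigned int divide_up(unsigned int x, unsigned int y)
{
  return (x + y - 1) / y;
}

static int partitioned_sort_key(int shader_key,
                                int state_index,
                                int max_shaders,
                                int sort_partition_divisor)
{
  /* Repeats the shader key range once per partition: indices group first by
   * partition of the state index (locality), then by shader within each
   * partition (coherence). */
  return shader_key + max_shaders * (state_index / sort_partition_divisor);
}

int main()
{
  const int max_shaders = 4;                  /* example values only */
  const unsigned int max_num_paths = 1 << 20;
  const int num_sort_partitions = 16;
  const int divisor = (int)divide_up(max_num_paths, num_sort_partitions);

  /* Same shader, but states in different partitions land in different buckets. */
  std::printf("bucket A: %d\n", partitioned_sort_key(2, 100, max_shaders, divisor));
  std::printf("bucket B: %d\n", partitioned_sort_key(2, 100 + divisor, max_shaders, divisor));
  return 0;
}
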
 intern/cycles/device/metal/device_impl.mm        |  6
 intern/cycles/device/metal/queue.h               |  1
 intern/cycles/device/metal/queue.mm              | 21
 intern/cycles/device/metal/util.h                |  1
 intern/cycles/device/metal/util.mm               | 18
 intern/cycles/device/queue.h                     |  8
 intern/cycles/integrator/path_trace.cpp          |  2
 intern/cycles/integrator/path_trace_work_gpu.cpp | 21
 intern/cycles/integrator/path_trace_work_gpu.h   |  3
 intern/cycles/kernel/integrator/state.h          |  3
 intern/cycles/kernel/integrator/state_flow.h     | 12
 11 files changed, 82 insertions(+), 14 deletions(-)
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 87c83242240..ba9317e3204 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -217,6 +217,10 @@ string MetalDevice::get_source(const uint kernel_features)
build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
}
+ if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
+ build_options += " -D__KERNEL_SORT_PARTITIONING__ ";
+ }
+
if (use_metalrt) {
build_options += "-D__METALRT__ ";
if (motion_blur) {
@@ -652,7 +656,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update data storage pointers in launch parameters. */
if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
- const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
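
The sizeof -> offsetof change above (and the matching ones in queue.mm below) relies on IntegratorStateGPU laying out all of its device pointers first, with the new sort_partition_divisor as trailing plain data that must not be patched with device addresses. A minimal standalone sketch of that layout idea, using a hypothetical ExampleStateGPU rather than the real struct:

#include <cstddef>
#include <cstdio>

/* Hypothetical stand-in for IntegratorStateGPU: device pointers first,
 * trailing plain-old-data last. */
struct ExampleStateGPU {
  int *queue_counter;                   /* contiguous device pointers ... */
  int *next_main_path_index;
  unsigned int sort_partition_divisor;  /* ... then plain data, not a pointer */
};

int main()
{
  /* sizeof() would sweep the trailing uint (plus padding) into the pointer
   * update loop; offsetof() ends the pointer block where the pointers end. */
  std::printf("sizeof = %zu, pointer block = %zu\n",
              sizeof(ExampleStateGPU),
              offsetof(ExampleStateGPU, sort_partition_divisor));
  return 0;
}
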
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..836289172f7 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
+ virtual int num_sort_partitions(const size_t) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 03e60b6bb6e..6a9cc552098 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,23 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
+int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const
+{
+ /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+ * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+ */
+ if (metal_device_->launch_params.data.max_shaders >= 300) {
+ return 1;
+ }
+
+ const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements(
+ metal_device_->mtlDevice);
+ if (optimal_partition_elements) {
+ return num_concurrent_states(state_size) / optimal_partition_elements;
+ }
+ return 1;
+}
+
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -359,7 +376,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -416,7 +433,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index fd32d8a260f..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -37,6 +37,7 @@ struct MetalInfo {
static int get_apple_gpu_core_count(id<MTLDevice> device);
static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+ static int optimal_sort_partition_elements(id<MTLDevice> device);
static string get_device_name(id<MTLDevice> device);
};
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a7a5b596b8f..c336dc310c8 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -72,6 +72,24 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_UNKNOWN;
}
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+ if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+ return atoi(str);
+ }
+
+ /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+ * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+ * overall render time speedup of up to 15%. */
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ AppleGPUArchitecture arch = get_apple_gpu_architecture(device);
+ if (arch == APPLE_M1 || arch == APPLE_M2) {
+ return 65536;
+ }
+ }
+ return 0;
+}
+
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
{
static vector<id<MTLDevice>> usable_devices;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 14a5db3a204..20308e4a106 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,14 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states() const = 0;
+ /* Number of partitions within which active indices are sorted by material ID.
+ * Using more partitions lets us trade off material coherence for better integrator state fetch
+ * locality. */
+ virtual int num_sort_partitions(const size_t /*state_size*/) const
+ {
+ return 1;
+ }
+
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 6912bf928cd..ed278821b46 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -373,7 +373,7 @@ void PathTrace::path_trace(RenderWork &render_work)
work_balance_infos_[i].time_spent += work_time;
work_balance_infos_[i].occupancy = statistics.occupancy;
- VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
<< work_time / num_samples
<< " seconds per sample), occupancy: " << statistics.occupancy;
});
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e262c252ce3..d51e8a28bb4 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -182,18 +182,19 @@ void PathTraceWorkGPU::alloc_integrator_queue()
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Allocate arrays for shader sorting. */
- const int max_shaders = device_scene_->data.max_shaders;
- if (integrator_shader_sort_counter_.size() < max_shaders) {
- integrator_shader_sort_counter_.alloc(max_shaders);
+ num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size());
+ const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+ if (integrator_shader_sort_counter_.size() < sort_buckets) {
+ integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
- integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
- integrator_shader_mnee_sort_counter_.alloc(max_shaders);
+ integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
- integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+ integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
@@ -237,6 +238,10 @@ void PathTraceWorkGPU::init_execution()
{
queue_->init_execution();
+ /* Setup sort partitioning divisor for better cache utilization. */
+ integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+ num_sort_partitions_);
+
/* Copy to device side struct in constant memory. */
device_->const_copy_to(
"integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
@@ -486,9 +491,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
- int max_shaders = device_scene_->data.max_shaders;
+ int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
- DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+ DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 4c10a221a30..a805258d1b5 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
bool interop_use_checked_ = false;
bool interop_use_ = false;
+ /* Number of partitions to sort state indices into prior to material sort. */
+ int num_sort_partitions_;
+
/* Maximum number of concurrent integrator states. */
int max_num_paths_;
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d6fef27f344..d10d31e930e 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
+
+ /* Divisor used to partition active indices by locality when sorting by material. */
+ uint sort_partition_divisor;
} IntegratorStateGPU;
/* Abstraction
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..d397ef385e7 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -67,9 +67,17 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+# ifdef __KERNEL_SORT_PARTITIONING__
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+# define INTEGRATOR_SORT_KEY(key, state) \
+ (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+# else
+# define INTEGRATOR_SORT_KEY(key, state) (key)
+# endif
+
# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
{ \
- const int key_ = key; \
+ const int key_ = INTEGRATOR_SORT_KEY(key, state); \
atomic_fetch_and_add_uint32( \
&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
@@ -79,7 +87,7 @@ CCL_NAMESPACE_BEGIN
}
# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
{ \
- const int key_ = key; \
+ const int key_ = INTEGRATOR_SORT_KEY(key, state); \
atomic_fetch_and_sub_uint32( \
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
atomic_fetch_and_add_uint32( \