7 files changed, 45 insertions, 52 deletions
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index d8bb3b867cd..d1250b83d22 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -229,10 +229,6 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
     global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
   }
 
-  if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
-    build_options += " -D__KERNEL_SORT_PARTITIONING__ ";
-  }
-
   if (use_metalrt) {
     global_defines += "#define __METALRT__\n";
     if (motion_blur) {
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 836289172f7..fc32740f3e1 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,7 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t) const override;
   virtual int num_concurrent_busy_states() const override;
-  virtual int num_sort_partitions(const size_t) const override;
+  virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 6a9cc552098..5ac63a16c61 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,21 +293,9 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
   return result;
 }
 
-int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const
+int MetalDeviceQueue::num_sort_partition_elements() const
 {
-  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
-   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
-   */
-  if (metal_device_->launch_params.data.max_shaders >= 300) {
-    return 1;
-  }
-
-  const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements(
-      metal_device_->mtlDevice);
-  if (optimal_partition_elements) {
-    return num_concurrent_states(state_size) / optimal_partition_elements;
-  }
-  return 1;
+  return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
 }
 
 void MetalDeviceQueue::init_execution()
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index c336dc310c8..65c67c400fe 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -82,10 +82,7 @@ int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
    * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
    * overall render time speedup of up to 15%. */
   if (get_device_vendor(device) == METAL_GPU_APPLE) {
-    AppleGPUArchitecture arch = get_apple_gpu_architecture(device);
-    if (arch == APPLE_M1 || arch == APPLE_M2) {
-      return 65536;
-    }
+    return 65536;
   }
   return 0;
 }
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 20308e4a106..808431af401 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,12 +105,11 @@ class DeviceQueue {
    * value. */
   virtual int num_concurrent_busy_states() const = 0;
 
-  /* Number of partitions within which active indices are sorted by material ID.
-   * Using more partitions lets us trade off material coherence for better integrator state fetch
-   * locality. */
-  virtual int num_sort_partitions(const size_t /*state_size*/) const
+  /* Number of integrator states per shader sort partition, or 0 to disable partitioning. Smaller
+   * partitions improve state fetch locality at the cost of less coherent shader execution. */
+  virtual int num_sort_partition_elements() const
   {
-    return 1;
+    return 65536;
   }
 
   /* Initialize execution of kernels on this queue.
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index d51e8a28bb4..fa313f6460a 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -181,28 +181,54 @@ void PathTraceWorkGPU::alloc_integrator_queue()
 
 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
+  /* Compute sort partitions, to balance between memory locality and coherence.
+   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+   */
+  num_sort_partitions_ = 1;
+  if (device_scene_->data.max_shaders < 300) {
+    /* A queue reporting zero elements per partition opts out of partitioning. */
+    const int num_elements = queue_->num_sort_partition_elements();
+    if (num_elements) {
+      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
+    }
+  }
+
+  /* States are mapped to partitions by dividing the state index by this divisor. */
+  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+                                                                num_sort_partitions_);
+
   /* Allocate arrays for shader sorting. */
-  num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size());
   const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
   if (integrator_shader_sort_counter_.size() < sort_buckets) {
     integrator_shader_sort_counter_.alloc(sort_buckets);
     integrator_shader_sort_counter_.zero_to_device();
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_shader_sort_counter_.device_pointer;
 
-    integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
-    integrator_shader_raytrace_sort_counter_.zero_to_device();
-
-    integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
-    integrator_shader_mnee_sort_counter_.zero_to_device();
-
     integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
     integrator_shader_sort_prefix_sum_.zero_to_device();
-
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-        (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+  }
+
+  /* The feature-specific counters are checked independently of the main counter: a kernel
+   * feature may become enabled without the bucket count growing, and the counter must exist
+   * by then since enqueue_reset() zeroes it whenever the feature is enabled. */
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+      integrator_shader_raytrace_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+    }
+  }
+
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+      integrator_shader_mnee_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+    }
   }
 }
 
@@ -238,10 +264,6 @@ void PathTraceWorkGPU::init_execution()
 {
   queue_->init_execution();
 
-  /* Setup sort partitioning divisor for better cache utilization. */
-  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
-                                                                num_sort_partitions_);
-
   /* Copy to device side struct in constant memory. */
   device_->const_copy_to(
       "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
@@ -338,8 +360,12 @@ void PathTraceWorkGPU::enqueue_reset()
   queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
   queue_->zero_to_device(integrator_queue_counter_);
   queue_->zero_to_device(integrator_shader_sort_counter_);
-  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+  }
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  }
 
   /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
    * counter on the host side because `zero_to_device()` is not doing it. */
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index 1ae746022d0..4b03c665e17 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -99,13 +99,9 @@ ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
 }
 
-#  ifdef __KERNEL_SORT_PARTITIONING__
 /* Sort first by truncated state index (for good locality), then by key (for good coherence). */
-#    define INTEGRATOR_SORT_KEY(key, state) \
-      (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
-#  else
-#    define INTEGRATOR_SORT_KEY(key, state) (key)
-#  endif
+#  define INTEGRATOR_SORT_KEY(key, state) \
+    ((key) + kernel_data.max_shaders * ((state) / kernel_integrator_state.sort_partition_divisor))
 
 ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
                                                         IntegratorState state,