From 4b1d315017ef103f3034160d349b3c3c21a4cd6a Mon Sep 17 00:00:00 2001
From: Michael Jones <michael_p_jones@apple.com>
Date: Wed, 13 Jul 2022 20:56:57 +0100
Subject: Cycles: Improve cache usage on Apple GPUs by chunking active indices

This patch partitions the active indices into chunks prior to sorting by material, in order to trade off some material coherence for better locality. On Apple Silicon GPUs (particularly higher end M1-family GPUs), we observe overall render time speedups of up to 15%. The partitioning is implemented by repeating the range of `shader_sort_key` for each partition, and encoding a "locator" key that distributes the indices into sorted chunks.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D15331
---
 intern/cycles/kernel/integrator/state.h      |  3 +++
 intern/cycles/kernel/integrator/state_flow.h | 12 ++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'intern/cycles/kernel/integrator')

diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d6fef27f344..d10d31e930e 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {
 
   /* Index of main path which will be used by a next shadow catcher split.  */
   ccl_global int *next_main_path_index;
+
+  /* Divisor used to partition active indices by locality when sorting by material.  */
+  uint sort_partition_divisor;
 } IntegratorStateGPU;
 
 /* Abstraction
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..d397ef385e7 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -67,9 +67,17 @@ CCL_NAMESPACE_BEGIN
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
     INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
 
+#  ifdef __KERNEL_SORT_PARTITIONING__
+/* Sort first by truncated state index (for good locality), then by key (for good coherence): each partition (state / sort_partition_divisor) gets its own range of max_shaders keys. Arguments are parenthesized so expression arguments expand safely. */
+#    define INTEGRATOR_SORT_KEY(key, state) \
+      ((key) + kernel_data.max_shaders * ((state) / kernel_integrator_state.sort_partition_divisor))
+#  else
+#    define INTEGRATOR_SORT_KEY(key, state) (key)
+#  endif
+
 #  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
     { \
-      const int key_ = key; \
+      const int key_ = INTEGRATOR_SORT_KEY(key, state); \
       atomic_fetch_and_add_uint32( \
           &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
       INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
@@ -79,7 +87,7 @@ CCL_NAMESPACE_BEGIN
     }
 #  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
     { \
-      const int key_ = key; \
+      const int key_ = INTEGRATOR_SORT_KEY(key, state); \
       atomic_fetch_and_sub_uint32( \
           &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
       atomic_fetch_and_add_uint32( \
-- 
cgit v1.2.3