From 4b1d315017ef103f3034160d349b3c3c21a4cd6a Mon Sep 17 00:00:00 2001
From: Michael Jones <michael_p_jones@apple.com>
Date: Wed, 13 Jul 2022 20:56:57 +0100
Subject: Cycles: Improve cache usage on Apple GPUs by chunking active indices

This patch partitions the active indices into chunks prior to sorting by material in order to trade off some material coherence for better locality. On Apple Silicon GPUs (particularly higher-end M1-family GPUs), we observe overall render time speedups of up to 15%. The partitioning is implemented by repeating the range of `shader_sort_key` for each partition, and encoding a "locator" key which distributes the indices into sorted chunks.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D15331
---
 intern/cycles/device/metal/queue.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'intern/cycles/device/metal/queue.h')

diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..836289172f7 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t) const override;
   virtual int num_concurrent_busy_states() const override;
+  virtual int num_sort_partitions(const size_t) const override;
 
   virtual void init_execution() override;
 
-- 
cgit v1.2.3


From 523bbf7065547a67e7c23f67f546a5ed6433f809 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Thu, 14 Jul 2022 16:42:43 +0200
Subject: Cycles: generalize shader sorting / locality heuristic to all GPU
 devices

This was added for Metal, but also gives good results with CUDA and OptiX.
Also enable it for future Apple GPUs instead of only M1 and M2: it has been
shown to help across multiple GPUs, so the better bet seems to be to enable
rather than disable it.

Also moves some of the logic outside of the Metal device code, and always
enables the code in the kernel since other devices don't do dynamic compile.

Time per sample with OptiX + RTX A6000:
                                         new                  old
barbershop_interior                      0.0730s              0.0727s
bmw27                                    0.0047s              0.0053s
classroom                                0.0428s              0.0464s
fishy_cat                                0.0102s              0.0108s
junkshop                                 0.0366s              0.0395s
koro                                     0.0567s              0.0578s
monster                                  0.0206s              0.0223s
pabellon                                 0.0158s              0.0174s
sponza                                   0.0088s              0.0100s
spring                                   0.1267s              0.1280s
victor                                   0.0524s              0.0531s
wdas_cloud                               0.0817s              0.0816s

Ref D15331, T87836
---
 intern/cycles/device/metal/queue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'intern/cycles/device/metal/queue.h')

diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 836289172f7..fc32740f3e1 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,7 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t) const override;
   virtual int num_concurrent_busy_states() const override;
-  virtual int num_sort_partitions(const size_t) const override;
+  virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
 
-- 
cgit v1.2.3