From 4b1d315017ef103f3034160d349b3c3c21a4cd6a Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 13 Jul 2022 20:56:57 +0100 Subject: Cycles: Improve cache usage on Apple GPUs by chunking active indices This patch partitions the active indices into chunks prior to sorting by material in order to tradeoff some material coherence for better locality. On Apple Silicon GPUs (particularly higher end M1-family GPUs), we observe overall render time speedups of up to 15%. The partitioning is implemented by repeating the range of `shader_sort_key` for each partition, and encoding a "locator" key which distributes the indices into sorted chunks. Reviewed By: brecht Differential Revision: https://developer.blender.org/D15331 --- intern/cycles/device/metal/queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'intern/cycles/device/metal/queue.h') diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index b0bd487c86d..836289172f7 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; + virtual int num_sort_partitions(const size_t) const override; virtual void init_execution() override; -- cgit v1.2.3 From 523bbf7065547a67e7c23f67f546a5ed6433f809 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 14 Jul 2022 16:42:43 +0200 Subject: Cycles: generalize shader sorting / locality heuristic to all GPU devices This was added for Metal, but also gives good results with CUDA and OptiX. Also enable it for future Apple GPUs instead of only M1 and M2, since this has been shown to help across multiple GPUs so the better bet seems to enable rather than disable it. Also moves some of the logic outside of the Metal device code, and always enables the code in the kernel since other devices don't do dynamic compile. Time per sample with OptiX + RTX A6000: new old barbershop_interior 0.0730s 0.0727s bmw27 0.0047s 0.0053s classroom 0.0428s 0.0464s fishy_cat 0.0102s 0.0108s junkshop 0.0366s 0.0395s koro 0.0567s 0.0578s monster 0.0206s 0.0223s pabellon 0.0158s 0.0174s sponza 0.0088s 0.0100s spring 0.1267s 0.1280s victor 0.0524s 0.0531s wdas_cloud 0.0817s 0.0816s Ref D15331, T87836 --- intern/cycles/device/metal/queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/device/metal/queue.h') diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 836289172f7..fc32740f3e1 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,7 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; - virtual int num_sort_partitions(const size_t) const override; + virtual int num_sort_partition_elements() const override; virtual void init_execution() override; -- cgit v1.2.3