Cycles: Split kernel - sort shaders

Reduce thread divergence in kernel_shader_eval.

Rays are sorted in blocks of 2048 according to shader->id.
On an R9 290, Classroom is ~30% faster and Pabellon Barcelona is ~8% faster.

Sorting is not performed for the CUDA split kernel.

Reviewers: sergey, maiself

Reviewed By: maiself

Differential Revision: https://developer.blender.org/D2598
author: Hristo Gueorguiev <prem.nirved@gmail.com> 2017-05-03 16:30:45 +0300
committer: Hristo Gueorguiev <prem.nirved@gmail.com> 2017-05-03 16:30:45 +0300
commit: 6bf4115c13962c99d1cdc97f2be92c4922f3fd33 (patch)
tree: 569c512a242caf2ea4465f2eef561933ed937a2f /intern/cycles/kernel/kernel_types.h
parent: 6f9c839f444f92c4b0c336a6f5e31cb9660d7dbc (diff)
1 files changed, 15 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 6417f621c8f..9b354457b91 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN
 #  define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
 #endif
 
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+#  define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+#  define SHADER_SORT_LOCAL_SIZE 32
+#else
+#  define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
@@ -1321,6 +1333,9 @@ enum QueueNumber {
 	 */
 	QUEUE_SHADOW_RAY_CAST_DL_RAYS,
 
+	/* Rays sorted according to shader->id */
+	QUEUE_SHADER_SORTED_RAYS,
+
 #ifdef __BRANCHED_PATH__
 	/* All rays moving to next iteration of the indirect loop for light */
 	QUEUE_LIGHT_INDIRECT_ITER,
author	Hristo Gueorguiev <prem.nirved@gmail.com>	2017-05-03 16:30:45 +0300
committer	Hristo Gueorguiev <prem.nirved@gmail.com>	2017-05-03 16:30:45 +0300
commit	6bf4115c13962c99d1cdc97f2be92c4922f3fd33 (patch)
tree	569c512a242caf2ea4465f2eef561933ed937a2f /intern/cycles/kernel/kernel_types.h
parent	6f9c839f444f92c4b0c336a6f5e31cb9660d7dbc (diff)