Diffstat (limited to 'intern/cycles/kernel/integrator/state_flow.h')
-rw-r--r-- | intern/cycles/kernel/integrator/state_flow.h | 273
1 file changed, 172 insertions(+), 101 deletions(-)
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..4b03c665e17 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -10,125 +10,196 @@ CCL_NAMESPACE_BEGIN
 /* Control Flow
  *
- * Utilities for control flow between kernels. The implementation may differ per device
- * or even be handled on the host side. To abstract such differences, experiment with
- * different implementations and for debugging, this is abstracted using macros.
+ * Utilities for control flow between kernels. The implementation differs between CPU and
+ * GPU devices. For the latter, part of the logic is handled on the host side with wavefronts.
  *
  * There is a main path for regular path tracing of camera rays. Shadows for next
  * event estimation branch off from this into their own path, which may be computed in
- * parallel while the main path continues.
+ * parallel while the main path continues. Additionally, shading kernels are sorted using
+ * a key for coherence.
  *
  * Each kernel on the main path must call one of these functions. These may not be called
  * multiple times from the same kernel.
  *
- * INTEGRATOR_PATH_INIT(next_kernel)
- * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
- * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ * integrator_path_init(kg, state, next_kernel)
+ * integrator_path_next(kg, state, current_kernel, next_kernel)
+ * integrator_path_terminate(kg, state, current_kernel)
  *
  * For the shadow path, similar functions are used, and again each shadow kernel must call
  * one of them, and only once.
  */
 
-#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(state, path, queued_kernel) == 0)
-#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED \
-  (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0)
+ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
+{
+  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
+}
+
+ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
+{
+  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
+}
 
 #ifdef __KERNEL_GPU__
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
-        &kernel_integrator_state.next_shadow_path_index[0], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
-
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_sub_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
+      &kernel_integrator_state.next_shadow_path_index[0], 1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+#  define INTEGRATOR_SORT_KEY(key, state) \
+    (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
 
 #else
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-    }
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-      (void)current_kernel; \
-    }
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = &state->shadow_type; \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+  (void)current_kernel;
+}
 
 #endif
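
The queue bookkeeping that these functions perform on the GPU can be modeled in isolation. The sketch below is not Cycles code: QueueCounter, PathState, and the kernel ids are hypothetical stand-ins, and only the num_queued[] / queued_kernel protocol from the diff is reproduced. A path is created with integrator_path_init, migrates between per-kernel queues with integrator_path_next, and counts as terminated once queued_kernel is 0.

// Standalone model of the main-path state machine above; all types and
// kernel ids here are hypothetical, only the counter protocol is mirrored.
#include <atomic>
#include <cassert>
#include <cstdint>

enum DeviceKernel : uint32_t { KERNEL_NONE = 0, KERNEL_INTERSECT, KERNEL_SHADE, NUM_KERNELS };

struct QueueCounter {
  std::atomic<uint32_t> num_queued[NUM_KERNELS];
  QueueCounter() {
    for (auto &q : num_queued) q.store(0);
  }
};

struct PathState {
  uint32_t queued_kernel = KERNEL_NONE;
};

// Analogue of integrator_path_init: queue the first kernel of a new path.
void path_init(QueueCounter &qc, PathState &state, DeviceKernel next) {
  qc.num_queued[next].fetch_add(1);
  state.queued_kernel = next;
}

// Analogue of integrator_path_next: move the path from one queue to another.
void path_next(QueueCounter &qc, PathState &state, DeviceKernel current, DeviceKernel next) {
  qc.num_queued[current].fetch_sub(1);
  qc.num_queued[next].fetch_add(1);
  state.queued_kernel = next;
}

// Analogue of integrator_path_terminate: queued_kernel == 0 marks the path dead.
void path_terminate(QueueCounter &qc, PathState &state, DeviceKernel current) {
  qc.num_queued[current].fetch_sub(1);
  state.queued_kernel = KERNEL_NONE;
}

int main() {
  QueueCounter qc;
  PathState state;
  path_init(qc, state, KERNEL_INTERSECT);
  path_next(qc, state, KERNEL_INTERSECT, KERNEL_SHADE);
  path_terminate(qc, state, KERNEL_SHADE);
  assert(qc.num_queued[KERNEL_INTERSECT] == 0 && qc.num_queued[KERNEL_SHADE] == 0);
  assert(state.queued_kernel == KERNEL_NONE);  // integrator_path_is_terminated
}

The host side reads the num_queued[] counters between launches to decide which wavefront kernel to schedule next, which is the "handled on the host side with wavefronts" part of the updated header comment.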
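The two integrator_shadow_path_init variants also differ in where the shadow state lives. On the GPU a fresh slot is claimed from the global next_shadow_path_index counter, so the shadow path becomes an independent queued work item; on the CPU the state is simply the ao or shadow sub-state embedded in the main path state. A minimal sketch of the GPU-style slot allocation follows, with hypothetical names (ShadowPool, MAX_SHADOW_PATHS are not Cycles identifiers):

// Sketch of claiming an independent shadow-path slot, GPU style.
// A single atomic counter hands out indices into a preallocated array,
// so many threads can branch off shadow paths without locking.
#include <atomic>
#include <cassert>
#include <cstdint>

constexpr uint32_t MAX_SHADOW_PATHS = 1024;  // hypothetical capacity

struct ShadowPathState {
  uint32_t queued_kernel = 0;
};

struct ShadowPool {
  std::atomic<uint32_t> next_index{0};
  ShadowPathState paths[MAX_SHADOW_PATHS];

  ShadowPathState *claim(uint32_t next_kernel) {
    const uint32_t i = next_index.fetch_add(1);
    assert(i < MAX_SHADOW_PATHS);  // the real state is sized from queue limits
    paths[i].queued_kernel = next_kernel;
    return &paths[i];
  }
};

int main() {
  ShadowPool pool;
  ShadowPathState *s = pool.claim(/*next_kernel=*/7);
  assert(s == &pool.paths[0] && s->queued_kernel == 7);
}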
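INTEGRATOR_SORT_KEY is worth a worked example: it buckets paths first by a truncated state index and then by shader key, so sorting by bucket groups paths that are both close together in memory and about to run the same shader. The values of max_shaders and sort_partition_divisor below are made up for illustration only:

// Worked example of the sort-key layout:
//   bucket = key + max_shaders * (state / sort_partition_divisor)
// States in the same partition running the same shader share a bucket.
#include <cstdio>

int main() {
  const int max_shaders = 4;               // hypothetical shader count
  const int sort_partition_divisor = 256;  // hypothetical partition size
  const int examples[][2] = {
      {10, 2}, {11, 2},  // neighboring states, same shader -> same bucket (2)
      {300, 2},          // same shader, next partition     -> bucket 6
      {12, 3},           // same partition, other shader    -> bucket 3
  };
  for (const auto &e : examples) {
    const int state = e[0], key = e[1];
    const int bucket = key + max_shaders * (state / sort_partition_divisor);
    std::printf("state %3d shader %d -> bucket %d\n", state, key, bucket);
  }
}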