Cycles: make ambient occlusion pass take into account transparency again

This takes advantage of the new decoupled main and shadow paths. For the CPU we just store two nested structs in the integrator state, one for direct light shadows and one for AO. For the GPU we restrict the number of shade surface states to be executed based on the available space in the shadow paths queue. This also helps improve performance in benchmark scenes with an AO pass, since the shader raytracing kernel, which has worse performance, no longer needs to be used there. Differential Revision: https://developer.blender.org/D12900
author: Brecht Van Lommel <brecht@blender.org> 2021-10-17 19:08:00 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-10-20 18:50:31 +0300
commit: cccfa597ba69944817e0913944cf3c3d0a6e1165 (patch)
tree: 331fe58a76d3413bd247d745f56ba3b0f35dbeeb /intern/cycles/kernel/device
parent: 001f548227c413a4fdbee275744ea8bea886081a (diff)
3 files changed, 33 insertions, 9 deletions
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b6df74e835a..fcb398f7e6d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_sorted_paths_array(
-        int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+    kernel_gpu_integrator_sorted_paths_array(int num_states,
+                                             int num_states_limit,
+                                             int *indices,
+                                             int *num_indices,
+                                             int *key_counter,
+                                             int *key_prefix_sum,
+                                             int kernel)
 {
   gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+      num_states,
+      num_states_limit,
+      indices,
+      num_indices,
+      key_counter,
+      key_prefix_sum,
+      [kernel](const int state) {
         return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
                    INTEGRATOR_STATE(state, path, shader_sort_key) :
                    GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_prefix_sum(int *values, int num_values)
+    kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
-  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+      counter, prefix_sum, num_values);
 }
 
 /* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..aabe6e2e27a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif
 
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
 {
   if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
     return;
@@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
 
   int offset = 0;
   for (int i = 0; i < num_values; i++) {
-    const int new_offset = offset + values[i];
-    values[i] = offset;
+    const int new_offset = offset + counter[i];
+    prefix_sum[i] = offset;
+    counter[i] = 0;
     offset = new_offset;
   }
 }
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..7570c5a6bbd 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
 
 template<uint blocksize, typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint num_states,
+                                                const int num_states_limit,
                                                 int *indices,
                                                 int *num_indices,
+                                                int *key_counter,
                                                 int *key_prefix_sum,
                                                 GetKeyOp get_key_op)
 {
@@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
 
   if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
     const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
-    indices[index] = state_index;
+    if (index < num_states_limit) {
+      /* Assign state index. */
+      indices[index] = state_index;
+    }
+    else {
+      /* Can't process this state now, increase the counter again so that
+       * it will be handled in another iteration. */
+      atomic_fetch_and_add_uint32(&key_counter[key], 1);
+    }
   }
 }
author	Brecht Van Lommel <brecht@blender.org>	2021-10-17 19:08:00 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-10-20 18:50:31 +0300
commit	cccfa597ba69944817e0913944cf3c3d0a6e1165 (patch)
tree	331fe58a76d3413bd247d745f56ba3b0f35dbeeb /intern/cycles/kernel/device
parent	001f548227c413a4fdbee275744ea8bea886081a (diff)