git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>   2021-10-17 19:08:00 +0300
committer  Brecht Van Lommel <brecht@blender.org>   2021-10-20 18:50:31 +0300
commit     cccfa597ba69944817e0913944cf3c3d0a6e1165 (patch)
tree       331fe58a76d3413bd247d745f56ba3b0f35dbeeb /intern/cycles/integrator
parent     001f548227c413a4fdbee275744ea8bea886081a (diff)
Cycles: make ambient occlusion pass take into account transparency again
This takes advantage of the new decoupled main and shadow paths. On the CPU we simply store two nested structs in the integrator state, one for direct light shadows and one for AO. On the GPU we restrict the number of shade surface states to be executed based on the available space in the shadow paths queue.

This also improves performance in benchmark scenes with an AO pass, since the shader raytracing kernel, which performs worse, is no longer needed there.

Differential Revision: https://developer.blender.org/D12900
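To make the GPU-side constraint concrete, here is a minimal standalone C++ sketch of the idea; it is not the actual Cycles code, and names such as compute_num_paths_limit, next_shadow_path_index and spawns_ao_paths are illustrative only. Each scheduled shade-surface state can spawn one shadow path for direct lighting plus a second one for AO, so with an AO pass enabled only half of the free shadow-path slots can be turned into newly scheduled states:

#include <algorithm>
#include <climits>
#include <cstdio>

/* Hypothetical helper: how many shade-surface states can be scheduled
 * without overflowing the shadow path queue? */
static int compute_num_paths_limit(const int max_num_paths,
                                   const int next_shadow_path_index,
                                   const bool spawns_ao_paths)
{
  const int available_shadow_paths = max_num_paths - next_shadow_path_index;

  if (spawns_ao_paths) {
    /* Two shadow paths per state (direct light + AO), so only half of the
     * free slots worth of states can be launched safely. */
    return available_shadow_paths / 2;
  }

  /* One shadow path per state: no extra limit beyond the queue-space check
   * the caller already performs. */
  return INT_MAX;
}

int main()
{
  /* Example: 1024 shadow slots total, 960 already claimed, AO enabled,
   * 100 surface states queued -> schedule at most 32 of them. */
  const int num_queued = 100;
  const int limit = compute_num_paths_limit(1024, 960, true);
  const int work_size = std::min(num_queued, limit);
  printf("schedule %d of %d queued states this launch\n", work_size, num_queued);
  return 0;
}

In the patch below, the same idea appears as num_paths_limit = available_shadow_paths / 2 in PathTraceWorkGPU::enqueue_path_iteration, with the launch size clamped via work_size = min(work_size, num_paths_limit).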
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  58
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h      8
2 files changed, 45 insertions, 21 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 18aa5dda70d..a4788c437a1 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_shader_sort_prefix_sum_(
+ device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
@@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
integrator_shader_raytrace_sort_counter_.zero_to_device();
+ integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+ integrator_shader_sort_prefix_sum_.zero_to_device();
+
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
+ int num_paths_limit = INT_MAX;
+
if (kernel_creates_shadow_paths(kernel)) {
- if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
- queue_counter->num_queued[kernel]) {
+ const int available_shadow_paths = max_num_paths_ -
+ integrator_next_shadow_path_index_.data()[0];
+ if (available_shadow_paths < queue_counter->num_queued[kernel]) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
@@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
}
+ else if (kernel_creates_ao_paths(kernel)) {
+ /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+ num_paths_limit = available_shadow_paths / 2;
+ }
}
/* Schedule kernel with maximum number of queued items. */
- enqueue_path_iteration(kernel);
+ enqueue_path_iteration(kernel, num_paths_limit);
/* Update next shadow path index for kernels that can add shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
@@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
{
void *d_path_index = (void *)NULL;
@@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
- compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ compute_sorted_queued_paths(
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
+ work_size = min(work_size, num_paths_limit);
+
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
@@ -464,17 +479,20 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
- assert(d_counter != nullptr);
+ void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+ assert(d_counter != nullptr && d_prefix_sum != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
int max_shaders = device_scene_->data.max_shaders;
- void *args[] = {&d_counter, &max_shaders};
+ void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
@@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
/* Launch kernel to fill the active paths arrays. */
{
/* TODO: this could be smaller for terminated paths based on amount of work we want
- * to schedule. */
+ * to schedule, and also based on num_paths_limit.
+ *
+ * Also, when the number of paths is limited it may be better to prefer paths from the
+ * end of the array since compaction would need to do less work. */
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size),
+ const_cast<int *>(&num_paths_limit),
&d_queued_paths,
&d_num_queued_paths,
&d_counter,
+ &d_prefix_sum,
&d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
-
- if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
- queue_->zero_to_device(integrator_shader_sort_counter_);
- }
- else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
- queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
- }
- else {
- assert(0);
- }
}
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+ return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+ (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd2c1c197ae..e1f6c09d334 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
const int num_predicted_splits);
bool enqueue_path_iteration();
- void enqueue_path_iteration(DeviceKernel kernel);
+ void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
- void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit);
void compact_states(const int num_active_paths);
@@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Kernel properties. */
bool kernel_uses_sorting(DeviceKernel kernel);
bool kernel_creates_shadow_paths(DeviceKernel kernel);
+ bool kernel_creates_ao_paths(DeviceKernel kernel);
bool kernel_is_shadow_path(DeviceKernel kernel);
int kernel_max_active_path_index(DeviceKernel kernel);
@@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Shader sorting. */
device_vector<int> integrator_shader_sort_counter_;
device_vector<int> integrator_shader_raytrace_sort_counter_;
+ device_vector<int> integrator_shader_sort_prefix_sum_;
/* Path split. */
device_vector<int> integrator_next_shadow_path_index_;
device_vector<int> integrator_next_shadow_catcher_path_index_;