git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>  2021-10-17 19:08:00 +0300
committer  Brecht Van Lommel <brecht@blender.org>  2021-10-20 18:50:31 +0300
commit     cccfa597ba69944817e0913944cf3c3d0a6e1165 (patch)
tree       331fe58a76d3413bd247d745f56ba3b0f35dbeeb /intern
parent     001f548227c413a4fdbee275744ea8bea886081a (diff)
Cycles: make ambient occlusion pass take into account transparency again
Taking advantage of the new decoupled main and shadow paths. For the CPU we store two nested structs in the integrator state, one for direct light shadows and one for AO. For the GPU we restrict the number of shade surface states to be executed based on the available space in the shadow paths queue.

This also helps improve performance in benchmark scenes with an AO pass, since the shader raytracing kernel, which has worse performance, is no longer needed there.

Differential Revision: https://developer.blender.org/D12900
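
Illustrative note (not part of the commit): a minimal standalone C++ sketch of the "two nested structs" idea on the CPU. The field names here are simplified placeholders; the real layout is the IntegratorStateCPU change in integrator_state.h further down in the diff.

#include <cstdint>

/* Simplified stand-in for the per-shadow-ray state. */
struct IntegratorShadowStateCPU {
  uint32_t queued_kernel = 0; /* 0 = no shadow work pending */
  /* ... shadow ray, throughput, bounce counters, etc. ... */
};

/* The main CPU path state nests one shadow sub-state for direct light
 * shadows and a second one for ambient occlusion, so both kinds of shadow
 * rays can be in flight for the same main path. */
struct IntegratorStateCPU {
  /* ... main path members ... */
  IntegratorShadowStateCPU shadow; /* direct light shadow ray */
  IntegratorShadowStateCPU ao;     /* ambient occlusion shadow ray */
};

/* Mirrors path_state_init_queues() in kernel_path_state.h below. */
inline void reset_queues(IntegratorStateCPU &state)
{
  state.shadow.queued_kernel = 0;
  state.ao.queued_kernel = 0;
}
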
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp | 58
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h | 8
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h | 22
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h | 8
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 12
-rw-r--r--  intern/cycles/kernel/integrator/integrator_init_from_bake.h | 2
-rw-r--r--  intern/cycles/kernel/integrator/integrator_intersect_closest.h | 3
-rw-r--r--  intern/cycles/kernel/integrator/integrator_megakernel.h | 40
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_background.h | 2
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_surface.h | 87
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_volume.h | 3
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state.h | 1
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_flow.h | 6
-rw-r--r--  intern/cycles/kernel/integrator/integrator_subsurface.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_accumulate.h | 7
-rw-r--r--  intern/cycles/kernel/kernel_path_state.h | 1
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 9
-rw-r--r--  intern/cycles/render/film.cpp | 4
18 files changed, 178 insertions, 97 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 18aa5dda70d..a4788c437a1 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_shader_sort_prefix_sum_(
+ device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
@@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
integrator_shader_raytrace_sort_counter_.zero_to_device();
+ integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+ integrator_shader_sort_prefix_sum_.zero_to_device();
+
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
+ int num_paths_limit = INT_MAX;
+
if (kernel_creates_shadow_paths(kernel)) {
- if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
- queue_counter->num_queued[kernel]) {
+ const int available_shadow_paths = max_num_paths_ -
+ integrator_next_shadow_path_index_.data()[0];
+ if (available_shadow_paths < queue_counter->num_queued[kernel]) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
@@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
}
+ else if (kernel_creates_ao_paths(kernel)) {
+ /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+ num_paths_limit = available_shadow_paths / 2;
+ }
}
/* Schedule kernel with maximum number of queued items. */
- enqueue_path_iteration(kernel);
+ enqueue_path_iteration(kernel, num_paths_limit);
/* Update next shadow path index for kernels that can add shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
@@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
{
void *d_path_index = (void *)NULL;
@@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
- compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ compute_sorted_queued_paths(
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
+ work_size = min(work_size, num_paths_limit);
+
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
@@ -464,17 +479,20 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
- assert(d_counter != nullptr);
+ void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+ assert(d_counter != nullptr && d_prefix_sum != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
int max_shaders = device_scene_->data.max_shaders;
- void *args[] = {&d_counter, &max_shaders};
+ void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
@@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
/* Launch kernel to fill the active paths arrays. */
{
/* TODO: this could be smaller for terminated paths based on amount of work we want
- * to schedule. */
+ * to schedule, and also based on num_paths_limit.
+ *
+ * Also, when the number paths is limited it may be better to prefer paths from the
+ * end of the array since compaction would need to do less work. */
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size),
+ const_cast<int *>(&num_paths_limit),
&d_queued_paths,
&d_num_queued_paths,
&d_counter,
+ &d_prefix_sum,
&d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
-
- if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
- queue_->zero_to_device(integrator_shader_sort_counter_);
- }
- else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
- queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
- }
- else {
- assert(0);
- }
}
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+ return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+ (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
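
Illustrative note (not part of the diff): the scheduling rule added to enqueue_path_iteration() above, reduced to a small hedged C++ helper. Names are placeholders and the flush-first branch is omitted; the real code first runs the shadow kernels when the free space is smaller than the queued count.

#include <climits>

/* Illustrative only: how many states a shade-surface kernel may schedule so
 * that the shadow path pool cannot overflow. AO-capable kernels get half of
 * the free space because each state may spawn both a direct-light shadow
 * path and an AO shadow path. */
int ao_aware_paths_limit(const int max_num_paths,
                         const int next_shadow_path_index,
                         const bool creates_ao_paths)
{
  const int available_shadow_paths = max_num_paths - next_shadow_path_index;
  return creates_ao_paths ? available_shadow_paths / 2 : INT_MAX;
}

/* Usage, matching the clamp in enqueue_path_iteration():
 *   work_size = min(work_size, ao_aware_paths_limit(...)); */
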
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd2c1c197ae..e1f6c09d334 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
const int num_predicted_splits);
bool enqueue_path_iteration();
- void enqueue_path_iteration(DeviceKernel kernel);
+ void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
- void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit);
void compact_states(const int num_active_paths);
@@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Kernel properties. */
bool kernel_uses_sorting(DeviceKernel kernel);
bool kernel_creates_shadow_paths(DeviceKernel kernel);
+ bool kernel_creates_ao_paths(DeviceKernel kernel);
bool kernel_is_shadow_path(DeviceKernel kernel);
int kernel_max_active_path_index(DeviceKernel kernel);
@@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Shader sorting. */
device_vector<int> integrator_shader_sort_counter_;
device_vector<int> integrator_shader_raytrace_sort_counter_;
+ device_vector<int> integrator_shader_sort_prefix_sum_;
/* Path split. */
device_vector<int> integrator_next_shadow_path_index_;
device_vector<int> integrator_next_shadow_catcher_path_index_;
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b6df74e835a..fcb398f7e6d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(
- int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+ kernel_gpu_integrator_sorted_paths_array(int num_states,
+ int num_states_limit,
+ int *indices,
+ int *num_indices,
+ int *key_counter,
+ int *key_prefix_sum,
+ int kernel)
{
gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ [kernel](const int state) {
return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
INTEGRATOR_STATE(state, path, shader_sort_key) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *values, int num_values)
+ kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+ counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..aabe6e2e27a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
{
if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
return;
@@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
int offset = 0;
for (int i = 0; i < num_values; i++) {
- const int new_offset = offset + values[i];
- values[i] = offset;
+ const int new_offset = offset + counter[i];
+ prefix_sum[i] = offset;
+ counter[i] = 0;
offset = new_offset;
}
}
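
Illustrative note (not part of the diff): a host-side C++ sketch of the prefix-sum change above. The running offsets now land in a separate prefix_sum array while the per-shader counters are zeroed in the same pass, so the counters can be reused right away; the GPU kernel does exactly this, serially, from a single thread.

#include <cassert>
#include <vector>

void prefix_sum_and_reset(std::vector<int> &counter, std::vector<int> &prefix_sum)
{
  assert(counter.size() == prefix_sum.size());
  int offset = 0;
  for (size_t i = 0; i < counter.size(); i++) {
    const int new_offset = offset + counter[i];
    prefix_sum[i] = offset; /* exclusive prefix sum */
    counter[i] = 0;         /* reset for the next batch of paths */
    offset = new_offset;
  }
}

/* Example: counters {2, 0, 3} yield prefix sums {0, 2, 2} and zeroed counters. */
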
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..7570c5a6bbd 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
template<uint blocksize, typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ const int num_states_limit,
int *indices,
int *num_indices,
+ int *key_counter,
int *key_prefix_sum,
GetKeyOp get_key_op)
{
@@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
- indices[index] = state_index;
+ if (index < num_states_limit) {
+ /* Assign state index. */
+ indices[index] = state_index;
+ }
+ else {
+ /* Can't process this state now, increase the counter again so that
+ * it will be handled in another iteration. */
+ atomic_fetch_and_add_uint32(&key_counter[key], 1);
+ }
}
}
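
Illustrative note (not part of the diff): a single-threaded C++ stand-in for the overflow handling added to gpu_parallel_sorted_index_array() above; the real kernel reserves slots with atomic_fetch_and_add across GPU threads. States whose reserved slot falls beyond num_states_limit are not scheduled, and their key counter is bumped back up so they are picked up in a later iteration.

#include <vector>

constexpr int INACTIVE_KEY = -1; /* placeholder for GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY */

void sorted_index_array(const std::vector<int> &keys, /* shader sort key per state */
                        const int num_states_limit,
                        std::vector<int> &indices,
                        std::vector<int> &key_prefix_sum,
                        std::vector<int> &key_counter)
{
  for (int state_index = 0; state_index < (int)keys.size(); state_index++) {
    const int key = keys[state_index];
    if (key == INACTIVE_KEY) {
      continue;
    }
    const int index = key_prefix_sum[key]++; /* atomic in the GPU kernel */
    if (index < num_states_limit) {
      indices[index] = state_index; /* schedule this state now */
    }
    else {
      key_counter[key]++; /* defer to another iteration */
    }
  }
}
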
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
index 9bc115150ff..de916be24e7 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_bake.h
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -185,7 +185,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
/* Setup next kernel to execute. */
const int shader_index = shader & SHADER_MASK;
const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
}
else {
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index ef8dcb50115..c1315d48694 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -111,8 +111,7 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
* the matte path. */
- const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
- (kernel_data.film.pass_ao != PASS_UNUSED));
+ const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
index 6e3220aa3b7..21a483a792b 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -34,16 +34,12 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
ccl_global float *ccl_restrict render_buffer)
{
/* Each kernel indicates the next kernel to execute, so here we simply
- * have to check what that kernel is and execute it.
- *
- * TODO: investigate if we can use device side enqueue for GPUs to avoid
- * having to compile this big kernel. */
+ * have to check what that kernel is and execute it. */
while (true) {
+ /* Handle any shadow paths before we potentially create more shadow paths. */
const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
&state->shadow, shadow_path, queued_kernel);
-
if (shadow_queued_kernel) {
- /* First handle any shadow paths before we potentially create more shadow paths. */
switch (shadow_queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
integrator_intersect_shadow(kg, &state->shadow);
@@ -55,10 +51,30 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
+ continue;
+ }
+
+ /* Handle any AO paths before we potentially create more AO paths. */
+ const uint32_t ao_queued_kernel = INTEGRATOR_STATE(&state->ao, shadow_path, queued_kernel);
+ if (ao_queued_kernel) {
+ switch (ao_queued_kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(kg, &state->ao);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(kg, &state->ao, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ continue;
}
- else if (INTEGRATOR_STATE(state, path, queued_kernel)) {
- /* Then handle regular path kernels. */
- switch (INTEGRATOR_STATE(state, path, queued_kernel)) {
+
+ /* Then handle regular path kernels. */
+ const uint32_t queued_kernel = INTEGRATOR_STATE(state, path, queued_kernel);
+ if (queued_kernel) {
+ switch (queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
integrator_intersect_closest(kg, state);
break;
@@ -87,10 +103,10 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
+ continue;
}
- else {
- break;
- }
+
+ break;
}
}
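
Illustrative note (not part of the diff): the reworked CPU megakernel loop above, reduced to its drain order as a hedged C++ sketch with placeholder queue objects — existing shadow work first, then AO work, then the main path.

struct Queue {
  unsigned queued_kernel = 0; /* 0 = nothing queued */
  void run_next() { queued_kernel = 0; /* execute the queued kernel, stand-in */ }
};

void megakernel_loop(Queue &shadow, Queue &ao, Queue &main_path)
{
  while (true) {
    if (shadow.queued_kernel)    { shadow.run_next();    continue; }
    if (ao.queued_kernel)        { ao.run_next();        continue; }
    if (main_path.queued_kernel) { main_path.run_next(); continue; }
    break; /* all queues drained */
  }
}
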
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
index d98e53e6bbf..287c54d7243 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_background.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -198,7 +198,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 0108ba1373c..2fb0dcc2097 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -168,7 +168,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const bool is_light = light_sample_is_light(&ls);
/* Branch off shadow kernel. */
- INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ INTEGRATOR_SHADOW_PATH_INIT(
+ shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Copy volume stack and enter/exit volume. */
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -324,26 +325,14 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
}
#endif
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
ccl_device_forceinline void integrate_surface_ao_pass(
KernelGlobals kg,
- ConstIntegratorState state,
- ccl_private const ShaderData *ccl_restrict sd,
- ccl_private const RNGState *ccl_restrict rng_state,
- ccl_global float *ccl_restrict render_buffer)
-{
-# ifdef __KERNEL_OPTIX__
- optixDirectCall<void>(2, kg, state, sd, rng_state, render_buffer);
-}
-
-extern "C" __device__ void __direct_callable__ao_pass(
- KernelGlobals kg,
- ConstIntegratorState state,
+ IntegratorState state,
ccl_private const ShaderData *ccl_restrict sd,
ccl_private const RNGState *ccl_restrict rng_state,
ccl_global float *ccl_restrict render_buffer)
{
-# endif /* __KERNEL_OPTIX__ */
float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
@@ -352,24 +341,48 @@ extern "C" __device__ void __direct_callable__ao_pass(
float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray ray ccl_optional_struct_init;
- ray.P = ray_offset(sd->P, sd->Ng);
- ray.D = ao_D;
- ray.t = kernel_data.integrator.ao_bounces_distance;
- ray.time = sd->time;
- ray.dP = differential_zero_compact();
- ray.dD = differential_zero_compact();
-
- Intersection isect ccl_optional_struct_init;
- if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
- ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
- const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
- }
+ if (!(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f)) {
+ return;
}
+
+ Ray ray ccl_optional_struct_init;
+ ray.P = ray_offset(sd->P, sd->Ng);
+ ray.D = ao_D;
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+ ray.time = sd->time;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+
+ /* Copy volume stack and enter/exit volume. */
+ integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+ state, path, render_pixel_index);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) =
+ INTEGRATOR_STATE(state, path, rng_offset) -
+ PRNG_BOUNCE_NUM * INTEGRATOR_STATE(state, path, transparent_bounce);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+ state, path, rng_hash);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+ state, path, sample);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
}
-#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+#endif /* defined(__AO__) */
template<uint node_feature_mask>
ccl_device bool integrate_surface(KernelGlobals kg,
@@ -474,14 +487,12 @@ ccl_device bool integrate_surface(KernelGlobals kg,
PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
integrate_surface_direct_light(kg, state, &sd, &rng_state);
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
/* Ambient occlusion pass. */
- if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
- if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
- (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
- PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
- integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
- }
+ if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+ (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+ integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
}
#endif
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 13a5e7bda05..1dd701237a8 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -776,7 +776,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
const bool is_light = light_sample_is_light(ls);
/* Branch off shadow kernel. */
- INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ INTEGRATOR_SHADOW_PATH_INIT(
+ shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 4f21ab35d1f..84efcb00349 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -92,6 +92,7 @@ typedef struct IntegratorStateCPU {
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
IntegratorShadowStateCPU shadow;
+ IntegratorShadowStateCPU ao;
} IntegratorStateCPU;
/* Path Queue
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
index df8fb5e0e46..1569bf68e24 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -63,7 +63,7 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
&kernel_integrator_state.next_shadow_path_index[0], 1); \
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
@@ -129,8 +129,8 @@ CCL_NAMESPACE_BEGIN
(void)current_kernel; \
}
-# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
- IntegratorShadowState shadow_state = &state->shadow; \
+# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
+ IntegratorShadowState shadow_state = &state->shadow_type; \
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
{ \
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
index e9517a82453..e3bf9db80f7 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -182,7 +182,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 848aaa18aae..54492bef974 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -408,6 +408,13 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
+ /* Ambient occlusion. */
+ if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+ return;
+ }
+
+ /* Direct light shadow. */
kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
#ifdef __PASSES__
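
Illustrative note (not part of the diff): the routing added to kernel_accum_light() above as a tiny C++ sketch; the flag bit and buffer handling are placeholders for the real PATH_RAY_SHADOW_FOR_AO flag and render-buffer writes.

#include <cstdint>

constexpr uint32_t SHADOW_FOR_AO = 1u << 28; /* placeholder for PATH_RAY_SHADOW_FOR_AO */

void accum_shadow_contribution(const uint32_t path_flag,
                               const float contribution[3],
                               float *ao_pass,                        /* AO pass in the render buffer */
                               void (*accum_combined)(const float *)) /* combined/light passes */
{
  if (path_flag & SHADOW_FOR_AO) {
    for (int i = 0; i < 3; i++) {
      ao_pass[i] += contribution[i]; /* AO results bypass the combined pass */
    }
    return;
  }
  accum_combined(contribution); /* direct light shadow */
}
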
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index fa8de14916e..428eb3498f7 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -28,6 +28,7 @@ ccl_device_inline void path_state_init_queues(IntegratorState state)
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
#ifdef __KERNEL_CPU__
INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
#endif
}
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index fa8453b99cb..4bdd8185ca6 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -285,21 +285,22 @@ enum PathRayFlag {
PATH_RAY_VOLUME_PASS = (1U << 26U),
PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
- /* Shadow ray is for a light or surface. */
+ /* Shadow ray is for a light or surface, or AO. */
PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
+ PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
/* A shadow catcher object was hit and the path was split into two. */
- PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
+ PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
/* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
* their dedicated pass for later division.
*
* NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
* which is separate from the light passes. */
- PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
+ PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
/* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
- PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
};
/* Configure ray visibility bits for rays and objects respectively,
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 48f87ea3bf7..381f794545a 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -677,10 +677,6 @@ uint Film::get_kernel_features(const Scene *scene) const
kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
}
}
-
- if (pass_type == PASS_AO) {
- kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
- }
}
return kernel_features;