18 files changed, 178 insertions, 97 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 18aa5dda70d..a4788c437a1 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_shader_sort_prefix_sum_(
+          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
       integrator_next_shadow_path_index_(
           device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
       integrator_next_shadow_catcher_path_index_(
@@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
   integrator_shader_raytrace_sort_counter_.zero_to_device();
 
+  integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+  integrator_shader_sort_prefix_sum_.zero_to_device();
+
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
       (int *)integrator_shader_sort_counter_.device_pointer;
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
   /* For kernels that add shadow paths, check if there is enough space available.
    * If not, schedule shadow kernels first to clear out the shadow paths. */
+  int num_paths_limit = INT_MAX;
+
   if (kernel_creates_shadow_paths(kernel)) {
-    if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
-        queue_counter->num_queued[kernel]) {
+    const int available_shadow_paths = max_num_paths_ -
+                                       integrator_next_shadow_path_index_.data()[0];
+    if (available_shadow_paths < queue_counter->num_queued[kernel]) {
       if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
         enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
         return true;
@@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
         return true;
       }
     }
+    else if (kernel_creates_ao_paths(kernel)) {
+      /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+      num_paths_limit = available_shadow_paths / 2;
+    }
   }
 
   /* Schedule kernel with maximum number of queued items. */
-  enqueue_path_iteration(kernel);
+  enqueue_path_iteration(kernel, num_paths_limit);
 
   /* Update next shadow path index for kernels that can add shadow paths. */
   if (kernel_creates_shadow_paths(kernel)) {
@@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
   return true;
 }
 
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
 {
   void *d_path_index = (void *)NULL;
 
@@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
       work_size = num_queued;
       d_path_index = (void *)queued_paths_.device_pointer;
 
-      compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+      compute_sorted_queued_paths(
+          DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
     }
     else if (num_queued < work_size) {
       work_size = num_queued;
@@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     }
   }
 
+  work_size = min(work_size, num_paths_limit);
+
   DCHECK_LE(work_size, max_num_paths_);
 
   switch (kernel) {
@@ -464,17 +479,20 @@
   }
 }
 
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+                                                   DeviceKernel queued_kernel,
+                                                   const int num_paths_limit)
 {
   int d_queued_kernel = queued_kernel;
   void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
-  assert(d_counter != nullptr);
+  void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+  assert(d_counter != nullptr && d_prefix_sum != nullptr);
 
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
     int max_shaders = device_scene_->data.max_shaders;
-    void *args[] = {&d_counter, &max_shaders};
+    void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
@@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
 
   /* Launch kernel to fill the active paths arrays. */
   {
     /* TODO: this could be smaller for terminated paths based on amount of work we want
-     * to schedule. */
+     * to schedule, and also based on num_paths_limit.
+     *
+     * Also, when the number of paths is limited it may be better to prefer paths from the
+     * end of the array since compaction would need to do less work. */
    const int work_size = kernel_max_active_path_index(queued_kernel);
 
    void *d_queued_paths = (void *)queued_paths_.device_pointer;
    void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
    void *args[] = {const_cast<int *>(&work_size),
+                   const_cast<int *>(&num_paths_limit),
                    &d_queued_paths,
                    &d_num_queued_paths,
                    &d_counter,
+                   &d_prefix_sum,
                    &d_queued_kernel};
 
    queue_->enqueue(kernel, work_size, args);
   }
-
-  if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
-    queue_->zero_to_device(integrator_shader_sort_counter_);
-  }
-  else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
-    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  }
-  else {
-    assert(0);
-  }
 }
 
 void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
           kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
 }
 
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+  return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+         (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
 bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
 {
   return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
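The scheduling change above caps how many states get launched when the AO pass is enabled: each surface-shade state can now branch off both a light shadow path and an AO shadow path, so only half of the remaining shadow-path slots can safely be handed out in one iteration. A minimal standalone sketch of that budget (the helper and parameter names are hypothetical, not part of the patch):

    #include <climits>

    /* Hypothetical helper: an AO-capable shade state may spawn one light
     * shadow path and one AO path, so at most half the free slots are usable. */
    static int ao_num_paths_limit(const int max_num_paths,
                                  const int next_shadow_path_index,
                                  const bool creates_ao_paths)
    {
      const int available_shadow_paths = max_num_paths - next_shadow_path_index;
      return creates_ao_paths ? available_shadow_paths / 2 : INT_MAX;
    }

enqueue_path_iteration() then clamps its launch size with work_size = min(work_size, num_paths_limit), as in the hunk above.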
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd2c1c197ae..e1f6c09d334 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
                               const int num_predicted_splits);
 
   bool enqueue_path_iteration();
-  void enqueue_path_iteration(DeviceKernel kernel);
+  void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
 
   void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+  void compute_sorted_queued_paths(DeviceKernel kernel,
+                                   DeviceKernel queued_kernel,
+                                   const int num_paths_limit);
 
   void compact_states(const int num_active_paths);
 
@@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Kernel properties. */
   bool kernel_uses_sorting(DeviceKernel kernel);
   bool kernel_creates_shadow_paths(DeviceKernel kernel);
+  bool kernel_creates_ao_paths(DeviceKernel kernel);
   bool kernel_is_shadow_path(DeviceKernel kernel);
   int kernel_max_active_path_index(DeviceKernel kernel);
 
@@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Shader sorting. */
   device_vector<int> integrator_shader_sort_counter_;
   device_vector<int> integrator_shader_raytrace_sort_counter_;
+  device_vector<int> integrator_shader_sort_prefix_sum_;
 
   /* Path split. */
   device_vector<int> integrator_next_shadow_path_index_;
   device_vector<int> integrator_next_shadow_catcher_path_index_;
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b6df74e835a..fcb398f7e6d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_sorted_paths_array(
-        int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+    kernel_gpu_integrator_sorted_paths_array(int num_states,
+                                             int num_states_limit,
+                                             int *indices,
+                                             int *num_indices,
+                                             int *key_counter,
+                                             int *key_prefix_sum,
+                                             int kernel)
 {
   gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+      num_states,
+      num_states_limit,
+      indices,
+      num_indices,
+      key_counter,
+      key_prefix_sum,
+      [kernel](const int state) {
         return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
                    INTEGRATOR_STATE(state, path, shader_sort_key) :
                    GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_prefix_sum(int *values, int num_values)
+    kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
-  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+      counter, prefix_sum, num_values);
 }
 
 /* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..aabe6e2e27a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif
 
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
 {
   if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
     return;
@@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
 
   int offset = 0;
   for (int i = 0; i < num_values; i++) {
-    const int new_offset = offset + values[i];
-    values[i] = offset;
+    const int new_offset = offset + counter[i];
+    prefix_sum[i] = offset;
+    counter[i] = 0;
     offset = new_offset;
   }
 }
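The reworked prefix-sum kernel writes the exclusive prefix sum into a dedicated buffer and clears the counters in the same pass, which is what lets compute_sorted_queued_paths() drop the separate zero_to_device() calls removed above. A CPU-side sketch of the assumed semantics:

    /* CPU model of the single-thread GPU kernel: exclusive prefix sum into a
     * separate buffer, resetting the counters for the next scheduling round. */
    static void prefix_sum_and_reset(int *counter, int *prefix_sum, const int num_values)
    {
      int offset = 0;
      for (int i = 0; i < num_values; i++) {
        const int new_offset = offset + counter[i];
        prefix_sum[i] = offset; /* First queue slot for sort key i. */
        counter[i] = 0;         /* Cleared so deferred states can be re-counted. */
        offset = new_offset;
      }
    }

For per-shader counts {3, 1, 2} this produces bucket starts {0, 3, 4} and leaves all counters at zero.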
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..7570c5a6bbd 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
 
 template<uint blocksize, typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint num_states,
+                                                const int num_states_limit,
                                                 int *indices,
                                                 int *num_indices,
+                                                int *key_counter,
                                                 int *key_prefix_sum,
                                                 GetKeyOp get_key_op)
 {
@@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
 
   if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
     const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
-    indices[index] = state_index;
+    if (index < num_states_limit) {
+      /* Assign state index. */
+      indices[index] = state_index;
+    }
+    else {
+      /* Can't process this state now, increase the counter again so that
+       * it will be handled in another iteration. */
+      atomic_fetch_and_add_uint32(&key_counter[key], 1);
+    }
   }
 }
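With the new num_states_limit parameter, a state whose atomically reserved slot falls past the limit is not dropped: its key count is re-incremented so the next sorted-queue build picks it up. An illustrative CPU model of that overflow rule using std::atomic (names are mine, not the kernel's):

    #include <atomic>

    /* A slot past the limit is re-counted instead of written, deferring the
     * state to a later scheduling iteration. */
    static void sorted_index_insert(const int state_index,
                                    const int key,
                                    const int num_states_limit,
                                    int *indices,
                                    std::atomic<int> *key_prefix_sum,
                                    std::atomic<int> *key_counter)
    {
      const int index = key_prefix_sum[key].fetch_add(1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
      else {
        key_counter[key].fetch_add(1);
      }
    }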
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
index 9bc115150ff..de916be24e7 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_bake.h
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -185,7 +185,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     /* Setup next kernel to execute. */
     const int shader_index = shader & SHADER_MASK;
     const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
-    if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+    if (shader_flags & SD_HAS_RAYTRACE) {
       INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
     }
     else {
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index ef8dcb50115..c1315d48694 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -111,8 +111,7 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
    * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
    * the matte path. */
-  const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
-                                    (kernel_data.film.pass_ao != PASS_UNUSED));
+  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
   if (use_raytrace_kernel) {
     INTEGRATOR_PATH_NEXT_SORTED(
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
index 6e3220aa3b7..21a483a792b 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -34,16 +34,12 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
                                       ccl_global float *ccl_restrict render_buffer)
 {
   /* Each kernel indicates the next kernel to execute, so here we simply
-   * have to check what that kernel is and execute it.
-   *
-   * TODO: investigate if we can use device side enqueue for GPUs to avoid
-   * having to compile this big kernel. */
+   * have to check what that kernel is and execute it. */
   while (true) {
+    /* Handle any shadow paths before we potentially create more shadow paths. */
     const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
         &state->shadow, shadow_path, queued_kernel);
-
     if (shadow_queued_kernel) {
-      /* First handle any shadow paths before we potentially create more shadow paths. */
       switch (shadow_queued_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
           integrator_intersect_shadow(kg, &state->shadow);
@@ -55,10 +51,30 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
           kernel_assert(0);
           break;
       }
+      continue;
+    }
+
+    /* Handle any AO paths before we potentially create more AO paths. */
+    const uint32_t ao_queued_kernel = INTEGRATOR_STATE(&state->ao, shadow_path, queued_kernel);
+    if (ao_queued_kernel) {
+      switch (ao_queued_kernel) {
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+          integrator_intersect_shadow(kg, &state->ao);
+          break;
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+          integrator_shade_shadow(kg, &state->ao, render_buffer);
+          break;
+        default:
+          kernel_assert(0);
+          break;
+      }
+      continue;
     }
-    else if (INTEGRATOR_STATE(state, path, queued_kernel)) {
-      /* Then handle regular path kernels. */
-      switch (INTEGRATOR_STATE(state, path, queued_kernel)) {
+
+    /* Then handle regular path kernels. */
+    const uint32_t queued_kernel = INTEGRATOR_STATE(state, path, queued_kernel);
+    if (queued_kernel) {
+      switch (queued_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
           integrator_intersect_closest(kg, state);
           break;
@@ -87,10 +103,10 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
           kernel_assert(0);
           break;
       }
+      continue;
     }
-    else {
-      break;
-    }
+
+    break;
   }
 }
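The restructured megakernel drains each dependent queue before the main path is allowed to spawn more work: shadow paths first, then AO paths, then the regular path kernel, with each handled kernel restarting the loop. A hypothetical standalone skeleton of that dispatch order:

    #include <functional>

    /* Each handler runs one queued kernel and returns false when its queue
     * is empty; names and signatures are illustrative only. */
    static void megakernel_dispatch(const std::function<bool()> &run_shadow,
                                    const std::function<bool()> &run_ao,
                                    const std::function<bool()> &run_main)
    {
      while (true) {
        if (run_shadow()) continue; /* Drain shadow paths first. */
        if (run_ao()) continue;     /* Then AO paths. */
        if (run_main()) continue;   /* Only then advance the main path. */
        break;                      /* All queues empty: the path is done. */
      }
    }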
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
index d98e53e6bbf..287c54d7243 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_background.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -198,7 +198,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
     const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
     const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
 
-    if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+    if (shader_flags & SD_HAS_RAYTRACE) {
       INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                   DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                   shader);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 0108ba1373c..2fb0dcc2097 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -168,7 +168,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   const bool is_light = light_sample_is_light(&ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+  INTEGRATOR_SHADOW_PATH_INIT(
+      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
 
   /* Copy volume stack and enter/exit volume. */
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -324,26 +325,14 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
 }
 #endif
 
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
 ccl_device_forceinline void integrate_surface_ao_pass(
     KernelGlobals kg,
-    ConstIntegratorState state,
-    ccl_private const ShaderData *ccl_restrict sd,
-    ccl_private const RNGState *ccl_restrict rng_state,
-    ccl_global float *ccl_restrict render_buffer)
-{
-# ifdef __KERNEL_OPTIX__
-  optixDirectCall<void>(2, kg, state, sd, rng_state, render_buffer);
-}
-
-extern "C" __device__ void __direct_callable__ao_pass(
-    KernelGlobals kg,
-    ConstIntegratorState state,
+    IntegratorState state,
     ccl_private const ShaderData *ccl_restrict sd,
     ccl_private const RNGState *ccl_restrict rng_state,
     ccl_global float *ccl_restrict render_buffer)
 {
-# endif /* __KERNEL_OPTIX__ */
   float bsdf_u, bsdf_v;
   path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
@@ -352,24 +341,48 @@ extern "C" __device__ void __direct_callable__ao_pass(
   float ao_pdf;
   sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-  if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-    Ray ray ccl_optional_struct_init;
-    ray.P = ray_offset(sd->P, sd->Ng);
-    ray.D = ao_D;
-    ray.t = kernel_data.integrator.ao_bounces_distance;
-    ray.time = sd->time;
-    ray.dP = differential_zero_compact();
-    ray.dD = differential_zero_compact();
-
-    Intersection isect ccl_optional_struct_init;
-    if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
-      ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
-    }
+  if (!(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f)) {
+    return;
   }
+
+  Ray ray ccl_optional_struct_init;
+  ray.P = ray_offset(sd->P, sd->Ng);
+  ray.D = ao_D;
+  ray.t = kernel_data.integrator.ao_bounces_distance;
+  ray.time = sd->time;
+  ray.dP = differential_zero_compact();
+  ray.dD = differential_zero_compact();
+
+  /* Branch off shadow kernel. */
+  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+
+  /* Copy volume stack and enter/exit volume. */
+  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
+
+  /* Write shadow ray and associated state to global memory. */
+  integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+
+  /* Copy state from main path to shadow path. */
+  const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
+  const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
+  uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
+  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+      state, path, render_pixel_index);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) =
+      INTEGRATOR_STATE(state, path, rng_offset) -
+      PRNG_BOUNCE_NUM * INTEGRATOR_STATE(state, path, transparent_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+      state, path, rng_hash);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+      state, path, sample);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
 }
-#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+#endif /* defined(__AO__) */
 
 template<uint node_feature_mask>
 ccl_device bool integrate_surface(KernelGlobals kg,
@@ -474,14 +487,12 @@ ccl_device bool integrate_surface(KernelGlobals kg,
     PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
     integrate_surface_direct_light(kg, state, &sd, &rng_state);
 
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
     /* Ambient occlusion pass. */
-    if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
-      if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
-          (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
-        PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
-        integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
-      }
+    if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+        (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+      PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+      integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
     }
 #endif
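When branching off the AO shadow path above, the main path's RNG offset is rewound by PRNG_BOUNCE_NUM for every transparent bounce, which appears to align the shadow path's sample dimensions with the surface bounce that spawned it, independent of how many transparent hits were passed through. A small sketch of that arithmetic (the constant's value here is a stand-in, not the real one):

    #include <cstdint>

    /* Stand-in for the real PRNG_BOUNCE_NUM constant; value is illustrative. */
    static const uint32_t kPrngBounceNum = 3;

    /* The main path advances rng_offset by kPrngBounceNum per bounce,
     * including transparent ones; rewinding undoes the transparent steps. */
    static uint32_t ao_shadow_rng_offset(const uint32_t rng_offset,
                                         const uint32_t transparent_bounce)
    {
      return rng_offset - kPrngBounceNum * transparent_bounce;
    }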
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 13a5e7bda05..1dd701237a8 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -776,7 +776,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const bool is_light = light_sample_is_light(ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+  INTEGRATOR_SHADOW_PATH_INIT(
+      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 4f21ab35d1f..84efcb00349 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -92,6 +92,7 @@ typedef struct IntegratorStateCPU {
 #undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 
   IntegratorShadowStateCPU shadow;
+  IntegratorShadowStateCPU ao;
 } IntegratorStateCPU;
 
 /* Path Queue
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
index df8fb5e0e46..1569bf68e24 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -63,7 +63,7 @@ CCL_NAMESPACE_BEGIN
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
     INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
     IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
         &kernel_integrator_state.next_shadow_path_index[0], 1); \
     atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
@@ -129,8 +129,8 @@ CCL_NAMESPACE_BEGIN
     (void)current_kernel; \
   }
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
-    IntegratorShadowState shadow_state = &state->shadow; \
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
+    IntegratorShadowState shadow_state = &state->shadow_type; \
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
 
 #  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
   { \
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
index e9517a82453..e3bf9db80f7 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -182,7 +182,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
   const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
 
-  if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+  if (shader_flags & SD_HAS_RAYTRACE) {
     INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                 shader);
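The extra shadow_type macro argument only matters on the CPU code path, where it is pasted into the member access to pick either the existing shadow state or the new ao state within IntegratorStateCPU. A simplified sketch of the token pasting (types and names reduced for illustration; the real ones live in integrator_state.h):

    /* Reduced stand-ins for IntegratorShadowStateCPU / IntegratorStateCPU. */
    struct ShadowStateSketch {
      int queued_kernel;
    };
    struct StateSketch {
      ShadowStateSketch shadow;
      ShadowStateSketch ao;
    };

    /* shadow_type is spliced into the member access, selecting state->shadow
     * or state->ao at expansion time. */
    #define SHADOW_PATH_INIT_SKETCH(shadow_state, state, next_kernel, shadow_type) \
      ShadowStateSketch *shadow_state = &(state)->shadow_type; \
      shadow_state->queued_kernel = (next_kernel);

    static void example(StateSketch *state)
    {
      SHADOW_PATH_INIT_SKETCH(ao_state, state, 1, ao); /* Expands to &state->ao. */
      (void)ao_state;
    }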
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 848aaa18aae..54492bef974 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -408,6 +408,13 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
   const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
 
+  /* Ambient occlusion. */
+  if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
+    kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+    return;
+  }
+
+  /* Direct light shadow. */
   kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
 
 #ifdef __PASSES__
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index fa8de14916e..428eb3498f7 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -28,6 +28,7 @@ ccl_device_inline void path_state_init_queues(IntegratorState state)
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 #ifdef __KERNEL_CPU__
   INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+  INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
 #endif
 }
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index fa8453b99cb..4bdd8185ca6 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -285,21 +285,22 @@ enum PathRayFlag {
   PATH_RAY_VOLUME_PASS = (1U << 26U),
   PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
 
-  /* Shadow ray is for a light or surface. */
+  /* Shadow ray is for a light or surface, or AO. */
   PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
+  PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
 
   /* A shadow catcher object was hit and the path was split into two. */
-  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
+  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
 
   /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
    * their dedicated pass for later division.
    *
    * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
    * which is separate from the light passes. */
-  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
+  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
 
   /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
-  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
+  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
 };
 
 /* Configure ray visibility bits for rays and objects respectively,
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 48f87ea3bf7..381f794545a 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -677,10 +677,6 @@ uint Film::get_kernel_features(const Scene *scene) const
         kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
       }
     }
-
-    if (pass_type == PASS_AO) {
-      kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
-    }
   }
 
   return kernel_features;