Cycles: avoid intermediate stack array for writing shadow intersections

Helps save one OptiX payload and is a bit more efficient. Differential Revision: https://developer.blender.org/D12909
author: Brecht Van Lommel <brecht@blender.org> 2021-10-18 20:20:09 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-10-19 16:10:55 +0300
commit: d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 (patch)
tree: 6ed5d1a8a488d1badaf5b8e462f7c1df2e75cbb2 /intern
parent: 943e73b07e26d64c04ccb7d8f656e3818a57cca0 (diff)
5 files changed, 77 insertions, 64 deletions
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index bdbd574bf0f..0d9ba7e6369 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -34,6 +34,8 @@
 #include "kernel/bvh/bvh_types.h"
 #include "kernel/bvh/bvh_util.h"
 
+#include "kernel/integrator/integrator_state_util.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPTIX__
@@ -361,15 +363,15 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
 
 #ifdef __SHADOW_RECORD_ALL__
 ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
                                                      ccl_private const Ray *ray,
-                                                     ccl_private Intersection *isect,
                                                      uint visibility,
                                                      uint max_hits,
                                                      ccl_private uint *num_hits)
 {
 #  ifdef __KERNEL_OPTIX__
-  uint p0 = pointer_pack_to_uint_0(isect);
-  uint p1 = pointer_pack_to_uint_1(isect);
+  uint p0 = state;
+  uint p1 = 0; /* Unused */
   uint p2 = 0; /* Number of hits. */
   uint p3 = max_hits;
   uint p4 = visibility;
@@ -412,7 +414,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
 #    ifdef __EMBREE__
   if (kernel_data.bvh.scene) {
     CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
-    ctx.isect_s = isect;
+    Intersection *isect_array = (Intersection *)state->shadow_isect;
+    ctx.isect_s = isect_array;
     ctx.max_hits = max_hits;
     IntersectContext rtc_ctx(&ctx);
     RTCRay rtc_ray;
@@ -428,21 +431,21 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
   if (kernel_data.bvh.have_motion) {
 #      ifdef __HAIR__
     if (kernel_data.bvh.have_curves) {
-      return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, visibility, max_hits, num_hits);
+      return bvh_intersect_shadow_all_hair_motion(kg, ray, state, visibility, max_hits, num_hits);
     }
 #      endif /* __HAIR__ */
 
-    return bvh_intersect_shadow_all_motion(kg, ray, isect, visibility, max_hits, num_hits);
+    return bvh_intersect_shadow_all_motion(kg, ray, state, visibility, max_hits, num_hits);
   }
 #    endif   /* __OBJECT_MOTION__ */
 
 #    ifdef __HAIR__
   if (kernel_data.bvh.have_curves) {
-    return bvh_intersect_shadow_all_hair(kg, ray, isect, visibility, max_hits, num_hits);
+    return bvh_intersect_shadow_all_hair(kg, ray, state, visibility, max_hits, num_hits);
   }
 #    endif /* __HAIR__ */
 
-  return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
+  return bvh_intersect_shadow_all(kg, ray, state, visibility, max_hits, num_hits);
 #  endif   /* __KERNEL_OPTIX__ */
 }
 #endif /* __SHADOW_RECORD_ALL__ */
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 42ab9eda37e..b997235b6e4 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -38,7 +38,7 @@ ccl_device_inline
 #endif
     bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                                      ccl_private const Ray *ray,
-                                     ccl_private Intersection *isect_array,
+                                     IntegratorShadowState state,
                                      const uint visibility,
                                      const uint max_hits,
                                      ccl_private uint *num_hits)
@@ -227,12 +227,13 @@ ccl_device_inline
                  * the largest distance to potentially replace when another hit
                  * is found. */
                 const int num_recorded_hits = min(max_hits, record_index);
-                float max_recorded_t = isect_array[0].t;
+                float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
                 int max_recorded_hit = 0;
 
                 for (int i = 1; i < num_recorded_hits; i++) {
-                  if (isect_array[i].t > max_recorded_t) {
-                    max_recorded_t = isect_array[i].t;
+                  const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+                  if (isect_t > max_recorded_t) {
+                    max_recorded_t = isect_t;
                     max_recorded_hit = i;
                   }
                 }
@@ -246,7 +247,7 @@ ccl_device_inline
                 t_max_current = t_max_world * t_world_to_instance;
               }
 
-              isect_array[record_index] = isect;
+              integrator_state_write_shadow_isect(state, &isect, record_index);
             }
 
             prim_addr++;
@@ -300,12 +301,12 @@ ccl_device_inline
 
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals kg,
                                          ccl_private const Ray *ray,
-                                         ccl_private Intersection *isect_array,
+                                         IntegratorShadowState state,
                                          const uint visibility,
                                          const uint max_hits,
                                          ccl_private uint *num_hits)
 {
-  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, state, visibility, max_hits, num_hits);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index d45eeec4815..869311b38e2 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -71,8 +71,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
-#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
-/* TODO: Move to another file? */
+#if defined(__KERNEL_CPU__)
 ccl_device int intersections_compare(const void *a, const void *b)
 {
   const Intersection *isect_a = (const Intersection *)a;
@@ -87,32 +86,6 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif
 
-#if defined(__SHADOW_RECORD_ALL__)
-ccl_device_inline void sort_intersections(ccl_private Intersection *hits, uint num_hits)
-{
-  kernel_assert(num_hits > 0);
-
-#  ifdef __KERNEL_GPU__
-  /* Use bubble sort which has more friendly memory pattern on GPU. */
-  bool swapped;
-  do {
-    swapped = false;
-    for (int j = 0; j < num_hits - 1; ++j) {
-      if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp = hits[j];
-        hits[j] = hits[j + 1];
-        hits[j + 1] = tmp;
-        swapped = true;
-      }
-    }
-    --num_hits;
-  } while (swapped);
-#  else
-  qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-#  endif
-}
-#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-
 /* For subsurface scattering, only sorting a small amount of intersections
  * so bubble sort is fine for CPU and GPU. */
 ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *hits,
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index e97b25d31a2..574f66ab708 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -225,16 +225,17 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 
   optixSetPayload_2(num_hits + 1);
 
-  Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+  const IntegratorShadowState state = optixGetPayload_0();
 
   if (record_index >= max_hits) {
     /* If maximum number of hits reached, find a hit to replace. */
-    float max_recorded_t = isect_array[0].t;
+    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
     int max_recorded_hit = 0;
 
     for (int i = 1; i < max_hits; i++) {
-      if (isect_array[i].t > max_recorded_t) {
-        max_recorded_t = isect_array[i].t;
+      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+      if (isect_t > max_recorded_t) {
+        max_recorded_t = isect_t;
         max_recorded_hit = i;
       }
     }
@@ -248,13 +249,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     record_index = max_recorded_hit;
   }
 
-  Intersection *const isect = isect_array + record_index;
-  isect->u = u;
-  isect->v = v;
-  isect->t = optixGetRayTmax();
-  isect->prim = prim;
-  isect->object = object;
-  isect->type = type;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
 
   optixIgnoreIntersection();
 #  endif /* __TRANSPARENT_SHADOWS__ */
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
index 9dc0eb02c9b..d5c6ec145f0 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -64,19 +64,61 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
 }
 
 #ifdef __TRANSPARENT_SHADOWS__
+#  if defined(__KERNEL_CPU__)
+ccl_device int shadow_intersections_compare(const void *a, const void *b)
+{
+  const Intersection *isect_a = (const Intersection *)a;
+  const Intersection *isect_b = (const Intersection *)b;
+
+  if (isect_a->t < isect_b->t)
+    return -1;
+  else if (isect_a->t > isect_b->t)
+    return 1;
+  else
+    return 0;
+}
+#  endif
+
+ccl_device_inline void sort_shadow_intersections(IntegratorShadowState state, uint num_hits)
+{
+  kernel_assert(num_hits > 0);
+
+#  ifdef __KERNEL_GPU__
+  /* Use bubble sort which has more friendly memory pattern on GPU. */
+  bool swapped;
+  do {
+    swapped = false;
+    for (int j = 0; j < num_hits - 1; ++j) {
+      if (INTEGRATOR_STATE_ARRAY(state, shadow_isect, j, t) >
+          INTEGRATOR_STATE_ARRAY(state, shadow_isect, j + 1, t)) {
+        struct Intersection tmp_j ccl_optional_struct_init;
+        struct Intersection tmp_j_1 ccl_optional_struct_init;
+        integrator_state_read_shadow_isect(state, &tmp_j, j);
+        integrator_state_read_shadow_isect(state, &tmp_j_1, j + 1);
+        integrator_state_write_shadow_isect(state, &tmp_j_1, j);
+        integrator_state_write_shadow_isect(state, &tmp_j, j + 1);
+        swapped = true;
+      }
+    }
+    --num_hits;
+  } while (swapped);
+#  else
+  Intersection *isect_array = (Intersection *)state->shadow_isect;
+  qsort(isect_array, num_hits, sizeof(Intersection), shadow_intersections_compare);
+#  endif
+}
+
 ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
                                                        IntegratorShadowState state,
                                                        ccl_private const Ray *ray,
                                                        const uint visibility)
 {
-  Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
-
   /* Limit the number hits to the max transparent bounces allowed and the size that we
    * have available in the integrator state. */
   const uint max_transparent_hits = integrate_shadow_max_transparent_hits(kg, state);
   const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
   uint num_hits = 0;
-  bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
+  bool opaque_hit = scene_intersect_shadow_all(kg, state, ray, visibility, max_hits, &num_hits);
 
   /* If number of hits exceed the transparent bounces limit, make opaque. */
   if (num_hits > max_transparent_hits) {
@@ -87,13 +129,7 @@ ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
     uint num_recorded_hits = min(num_hits, max_hits);
 
     if (num_recorded_hits > 0) {
-      sort_intersections(isect, num_recorded_hits);
-
-      /* Write intersection result into global integrator state memory.
-       * More efficient may be to do this directly from the intersection kernel. */
-      for (int hit = 0; hit < num_recorded_hits; hit++) {
-        integrator_state_write_shadow_isect(state, &isect[hit], hit);
-      }
+      sort_shadow_intersections(state, num_recorded_hits);
     }
 
     INTEGRATOR_STATE_WRITE(state, shadow_path, num_hits) = num_hits;
author	Brecht Van Lommel <brecht@blender.org>	2021-10-18 20:20:09 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-10-19 16:10:55 +0300
commit	d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 (patch)
tree	6ed5d1a8a488d1badaf5b8e462f7c1df2e75cbb2 /intern
parent	943e73b07e26d64c04ccb7d8f656e3818a57cca0 (diff)