From d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 18 Oct 2021 19:20:09 +0200 Subject: Cycles: avoid intermediate stack array for writing shadow intersections Helps save one OptiX payload and is a bit more efficient. Differential Revision: https://developer.blender.org/D12909 --- intern/cycles/kernel/bvh/bvh.h | 19 ++++---- intern/cycles/kernel/bvh/bvh_shadow_all.h | 15 +++--- intern/cycles/kernel/bvh/bvh_util.h | 29 +---------- intern/cycles/kernel/device/optix/kernel.cu | 22 ++++----- .../integrator/integrator_intersect_shadow.h | 56 ++++++++++++++++++---- 5 files changed, 77 insertions(+), 64 deletions(-) (limited to 'intern') diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index bdbd574bf0f..0d9ba7e6369 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -34,6 +34,8 @@ #include "kernel/bvh/bvh_types.h" #include "kernel/bvh/bvh_util.h" +#include "kernel/integrator/integrator_state_util.h" + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPTIX__ @@ -361,15 +363,15 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, #ifdef __SHADOW_RECORD_ALL__ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, + IntegratorShadowState state, ccl_private const Ray *ray, - ccl_private Intersection *isect, uint visibility, uint max_hits, ccl_private uint *num_hits) { # ifdef __KERNEL_OPTIX__ - uint p0 = pointer_pack_to_uint_0(isect); - uint p1 = pointer_pack_to_uint_1(isect); + uint p0 = state; + uint p1 = 0; /* Unused */ uint p2 = 0; /* Number of hits. */ uint p3 = max_hits; uint p4 = visibility; @@ -412,7 +414,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, # ifdef __EMBREE__ if (kernel_data.bvh.scene) { CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); - ctx.isect_s = isect; + Intersection *isect_array = (Intersection *)state->shadow_isect; + ctx.isect_s = isect_array; ctx.max_hits = max_hits; IntersectContext rtc_ctx(&ctx); RTCRay rtc_ray; @@ -428,21 +431,21 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, if (kernel_data.bvh.have_motion) { # ifdef __HAIR__ if (kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, visibility, max_hits, num_hits); + return bvh_intersect_shadow_all_hair_motion(kg, ray, state, visibility, max_hits, num_hits); } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all_motion(kg, ray, isect, visibility, max_hits, num_hits); + return bvh_intersect_shadow_all_motion(kg, ray, state, visibility, max_hits, num_hits); } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ if (kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair(kg, ray, isect, visibility, max_hits, num_hits); + return bvh_intersect_shadow_all_hair(kg, ray, state, visibility, max_hits, num_hits); } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits); + return bvh_intersect_shadow_all(kg, ray, state, visibility, max_hits, num_hits); # endif /* __KERNEL_OPTIX__ */ } #endif /* __SHADOW_RECORD_ALL__ */ diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 42ab9eda37e..b997235b6e4 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -38,7 +38,7 @@ ccl_device_inline #endif bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, ccl_private const Ray *ray, - ccl_private Intersection *isect_array, + IntegratorShadowState state, const uint visibility, const uint max_hits, ccl_private uint *num_hits) @@ -227,12 +227,13 @@ ccl_device_inline * the largest distance to potentially replace when another hit * is found. */ const int num_recorded_hits = min(max_hits, record_index); - float max_recorded_t = isect_array[0].t; + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); int max_recorded_hit = 0; for (int i = 1; i < num_recorded_hits; i++) { - if (isect_array[i].t > max_recorded_t) { - max_recorded_t = isect_array[i].t; + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; max_recorded_hit = i; } } @@ -246,7 +247,7 @@ ccl_device_inline t_max_current = t_max_world * t_world_to_instance; } - isect_array[record_index] = isect; + integrator_state_write_shadow_isect(state, &isect, record_index); } prim_addr++; @@ -300,12 +301,12 @@ ccl_device_inline ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals kg, ccl_private const Ray *ray, - ccl_private Intersection *isect_array, + IntegratorShadowState state, const uint visibility, const uint max_hits, ccl_private uint *num_hits) { - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits); + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, state, visibility, max_hits, num_hits); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h index d45eeec4815..869311b38e2 100644 --- a/intern/cycles/kernel/bvh/bvh_util.h +++ b/intern/cycles/kernel/bvh/bvh_util.h @@ -71,8 +71,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } -#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__)) -/* TODO: Move to another file? */ +#if defined(__KERNEL_CPU__) ccl_device int intersections_compare(const void *a, const void *b) { const Intersection *isect_a = (const Intersection *)a; @@ -87,32 +86,6 @@ ccl_device int intersections_compare(const void *a, const void *b) } #endif -#if defined(__SHADOW_RECORD_ALL__) -ccl_device_inline void sort_intersections(ccl_private Intersection *hits, uint num_hits) -{ - kernel_assert(num_hits > 0); - -# ifdef __KERNEL_GPU__ - /* Use bubble sort which has more friendly memory pattern on GPU. */ - bool swapped; - do { - swapped = false; - for (int j = 0; j < num_hits - 1; ++j) { - if (hits[j].t > hits[j + 1].t) { - struct Intersection tmp = hits[j]; - hits[j] = hits[j + 1]; - hits[j + 1] = tmp; - swapped = true; - } - } - --num_hits; - } while (swapped); -# else - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); -# endif -} -#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ - /* For subsurface scattering, only sorting a small amount of intersections * so bubble sort is fine for CPU and GPU. */ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *hits, diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index e97b25d31a2..574f66ab708 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -225,16 +225,17 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() optixSetPayload_2(num_hits + 1); - Intersection *const isect_array = get_payload_ptr_0(); + const IntegratorShadowState state = optixGetPayload_0(); if (record_index >= max_hits) { /* If maximum number of hits reached, find a hit to replace. */ - float max_recorded_t = isect_array[0].t; + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); int max_recorded_hit = 0; for (int i = 1; i < max_hits; i++) { - if (isect_array[i].t > max_recorded_t) { - max_recorded_t = isect_array[i].t; + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; max_recorded_hit = i; } } @@ -248,13 +249,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() record_index = max_recorded_hit; } - Intersection *const isect = isect_array + record_index; - isect->u = u; - isect->v = v; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = object; - isect->type = type; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; optixIgnoreIntersection(); # endif /* __TRANSPARENT_SHADOWS__ */ diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h index 9dc0eb02c9b..d5c6ec145f0 100644 --- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h +++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h @@ -64,19 +64,61 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k } #ifdef __TRANSPARENT_SHADOWS__ +# if defined(__KERNEL_CPU__) +ccl_device int shadow_intersections_compare(const void *a, const void *b) +{ + const Intersection *isect_a = (const Intersection *)a; + const Intersection *isect_b = (const Intersection *)b; + + if (isect_a->t < isect_b->t) + return -1; + else if (isect_a->t > isect_b->t) + return 1; + else + return 0; +} +# endif + +ccl_device_inline void sort_shadow_intersections(IntegratorShadowState state, uint num_hits) +{ + kernel_assert(num_hits > 0); + +# ifdef __KERNEL_GPU__ + /* Use bubble sort which has more friendly memory pattern on GPU. */ + bool swapped; + do { + swapped = false; + for (int j = 0; j < num_hits - 1; ++j) { + if (INTEGRATOR_STATE_ARRAY(state, shadow_isect, j, t) > + INTEGRATOR_STATE_ARRAY(state, shadow_isect, j + 1, t)) { + struct Intersection tmp_j ccl_optional_struct_init; + struct Intersection tmp_j_1 ccl_optional_struct_init; + integrator_state_read_shadow_isect(state, &tmp_j, j); + integrator_state_read_shadow_isect(state, &tmp_j_1, j + 1); + integrator_state_write_shadow_isect(state, &tmp_j_1, j); + integrator_state_write_shadow_isect(state, &tmp_j, j + 1); + swapped = true; + } + } + --num_hits; + } while (swapped); +# else + Intersection *isect_array = (Intersection *)state->shadow_isect; + qsort(isect_array, num_hits, sizeof(Intersection), shadow_intersections_compare); +# endif +} + ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg, IntegratorShadowState state, ccl_private const Ray *ray, const uint visibility) { - Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE]; - /* Limit the number hits to the max transparent bounces allowed and the size that we * have available in the integrator state. */ const uint max_transparent_hits = integrate_shadow_max_transparent_hits(kg, state); const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE); uint num_hits = 0; - bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits); + bool opaque_hit = scene_intersect_shadow_all(kg, state, ray, visibility, max_hits, &num_hits); /* If number of hits exceed the transparent bounces limit, make opaque. */ if (num_hits > max_transparent_hits) { @@ -87,13 +129,7 @@ ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg, uint num_recorded_hits = min(num_hits, max_hits); if (num_recorded_hits > 0) { - sort_intersections(isect, num_recorded_hits); - - /* Write intersection result into global integrator state memory. - * More efficient may be to do this directly from the intersection kernel. */ - for (int hit = 0; hit < num_recorded_hits; hit++) { - integrator_state_write_shadow_isect(state, &isect[hit], hit); - } + sort_shadow_intersections(state, num_recorded_hits); } INTEGRATOR_STATE_WRITE(state, shadow_path, num_hits) = num_hits; -- cgit v1.2.3