diff options
author | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:17:24 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:21:24 +0300 |
commit | e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch) | |
tree | 8cf3453d12edb177a218ef8009357518ec6cab6a /intern/cycles/kernel/bvh | |
parent | b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff) |
ClangFormat: apply to source, most of intern
Apply clang format as proposed in T53211.
For details on usage and instructions for migrating branches
without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
Diffstat (limited to 'intern/cycles/kernel/bvh')
21 files changed, 6505 insertions, 6626 deletions
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index e5f807833f3..13e72ed299f 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -57,19 +57,19 @@ CCL_NAMESPACE_BEGIN #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_HAIR_MINIMUM_WIDTH # include "kernel/bvh/bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION # include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_HAIR_MINIMUM_WIDTH | BVH_MOTION # include "kernel/bvh/bvh_traversal.h" #endif @@ -82,10 +82,10 @@ CCL_NAMESPACE_BEGIN # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_local_motion -# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR # include "kernel/bvh/bvh_local.h" # endif -#endif /* __BVH_LOCAL__ */ +#endif /* __BVH_LOCAL__ */ /* Volume BVH traversal */ @@ -96,16 +96,16 @@ CCL_NAMESPACE_BEGIN # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR # include "kernel/bvh/bvh_volume.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR # include "kernel/bvh/bvh_volume.h" # endif -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ /* Record all intersections - Shadow BVH traversal */ @@ -122,22 +122,22 @@ CCL_NAMESPACE_BEGIN # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR # include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION # include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION # include "kernel/bvh/bvh_shadow_all.h" # endif -#endif /* __SHADOW_RECORD_ALL__ */ +#endif /* __SHADOW_RECORD_ALL__ */ /* Record all intersections - Volume BVH traversal */ @@ -148,16 +148,16 @@ CCL_NAMESPACE_BEGIN # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR # include "kernel/bvh/bvh_volume_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR # include "kernel/bvh/bvh_volume_all.h" # endif -#endif /* __VOLUME_RECORD_ALL__ */ +#endif /* __VOLUME_RECORD_ALL__ */ #undef BVH_FEATURE #undef BVH_NAME_JOIN @@ -166,15 +166,15 @@ CCL_NAMESPACE_BEGIN ccl_device_inline bool scene_intersect_valid(const Ray *ray) { - /* NOTE: Due to some vectorization code non-finite origin point might - * cause lots of false-positive intersections which will overflow traversal - * stack. - * This code is a quick way to perform early output, to avoid crashes in - * such cases. - * From production scenes so far it seems it's enough to test first element - * only. - */ - return isfinite(ray->P.x); + /* NOTE: Due to some vectorization code non-finite origin point might + * cause lots of false-positive intersections which will overflow traversal + * stack. + * This code is a quick way to perform early output, to avoid crashes in + * such cases. + * From production scenes so far it seems it's enough to test first element + * only. + */ + return isfinite(ray->P.x); } /* Note: ray is passed by value to work around a possible CUDA compiler bug. */ @@ -186,59 +186,60 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, float difl, float extmax) { - PROFILING_INIT(kg, PROFILING_INTERSECT); + PROFILING_INIT(kg, PROFILING_INTERSECT); - if(!scene_intersect_valid(&ray)) { - return false; - } + if (!scene_intersect_valid(&ray)) { + return false; + } #ifdef __EMBREE__ - if(kernel_data.bvh.scene) { - isect->t = ray.t; - CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); - IntersectContext rtc_ctx(&ctx); - RTCRayHit ray_hit; - kernel_embree_setup_rayhit(ray, ray_hit, visibility); - rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit); - if(ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) { - kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect); - return true; - } - return false; - } -#endif /* __EMBREE__ */ + if (kernel_data.bvh.scene) { + isect->t = ray.t; + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); + IntersectContext rtc_ctx(&ctx); + RTCRayHit ray_hit; + kernel_embree_setup_rayhit(ray, ray_hit, visibility); + rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit); + if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && + ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) { + kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect); + return true; + } + return false; + } +#endif /* __EMBREE__ */ #ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { + if (kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax); -# endif /* __HAIR__ */ + if (kernel_data.bvh.have_curves) + return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax); +# endif /* __HAIR__ */ - return bvh_intersect_motion(kg, &ray, isect, visibility); - } -#endif /* __OBJECT_MOTION__ */ + return bvh_intersect_motion(kg, &ray, isect, visibility); + } +#endif /* __OBJECT_MOTION__ */ #ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax); -#endif /* __HAIR__ */ + if (kernel_data.bvh.have_curves) + return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax); +#endif /* __HAIR__ */ #ifdef __KERNEL_CPU__ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_instancing(kg, &ray, isect, visibility); -# endif /* __INSTANCING__ */ + if (kernel_data.bvh.have_instancing) + return bvh_intersect_instancing(kg, &ray, isect, visibility); +# endif /* __INSTANCING__ */ - return bvh_intersect(kg, &ray, isect, visibility); -#else /* __KERNEL_CPU__ */ + return bvh_intersect(kg, &ray, isect, visibility); +#else /* __KERNEL_CPU__ */ # ifdef __INSTANCING__ - return bvh_intersect_instancing(kg, &ray, isect, visibility); + return bvh_intersect_instancing(kg, &ray, isect, visibility); # else - return bvh_intersect(kg, &ray, isect, visibility); -# endif /* __INSTANCING__ */ + return bvh_intersect(kg, &ray, isect, visibility); +# endif /* __INSTANCING__ */ -#endif /* __KERNEL_CPU__ */ +#endif /* __KERNEL_CPU__ */ } #ifdef __BVH_LOCAL__ @@ -250,77 +251,61 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, uint *lcg_state, int max_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); + PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); - if(!scene_intersect_valid(&ray)) { - local_isect->num_hits = 0; - return false; - } -#ifdef __EMBREE__ - if(kernel_data.bvh.scene) { - CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS); - ctx.lcg_state = lcg_state; - ctx.max_hits = max_hits; - ctx.ss_isect = local_isect; - local_isect->num_hits = 0; - ctx.sss_object_id = local_object; - IntersectContext rtc_ctx(&ctx); - RTCRay rtc_ray; - kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY); - - /* Get the Embree scene for this intersection. */ - RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2); - if(geom) { - float3 P = ray.P; - float3 dir = ray.D; - float3 idir = ray.D; - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { - Transform ob_itfm; - rtc_ray.tfar = bvh_instance_motion_push(kg, - local_object, - &ray, - &P, - &dir, - &idir, - ray.t, - &ob_itfm); - /* bvh_instance_motion_push() returns the inverse transform but - * it's not needed here. */ - (void) ob_itfm; - - rtc_ray.org_x = P.x; - rtc_ray.org_y = P.y; - rtc_ray.org_z = P.z; - rtc_ray.dir_x = dir.x; - rtc_ray.dir_y = dir.y; - rtc_ray.dir_z = dir.z; - } - RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom); - if(scene) { - rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray); - } - } - - return local_isect->num_hits > 0; - } -#endif /* __EMBREE__ */ -#ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { - return bvh_intersect_local_motion(kg, - &ray, - local_isect, - local_object, - lcg_state, - max_hits); - } -#endif /* __OBJECT_MOTION__ */ - return bvh_intersect_local(kg, - &ray, - local_isect, - local_object, - lcg_state, - max_hits); + if (!scene_intersect_valid(&ray)) { + local_isect->num_hits = 0; + return false; + } +# ifdef __EMBREE__ + if (kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS); + ctx.lcg_state = lcg_state; + ctx.max_hits = max_hits; + ctx.ss_isect = local_isect; + local_isect->num_hits = 0; + ctx.sss_object_id = local_object; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY); + + /* Get the Embree scene for this intersection. */ + RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2); + if (geom) { + float3 P = ray.P; + float3 dir = ray.D; + float3 idir = ray.D; + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + Transform ob_itfm; + rtc_ray.tfar = bvh_instance_motion_push( + kg, local_object, &ray, &P, &dir, &idir, ray.t, &ob_itfm); + /* bvh_instance_motion_push() returns the inverse transform but + * it's not needed here. */ + (void)ob_itfm; + + rtc_ray.org_x = P.x; + rtc_ray.org_y = P.y; + rtc_ray.org_z = P.z; + rtc_ray.dir_x = dir.x; + rtc_ray.dir_y = dir.y; + rtc_ray.dir_z = dir.z; + } + RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom); + if (scene) { + rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray); + } + } + + return local_isect->num_hits > 0; + } +# endif /* __EMBREE__ */ +# ifdef __OBJECT_MOTION__ + if (kernel_data.bvh.have_motion) { + return bvh_intersect_local_motion(kg, &ray, local_isect, local_object, lcg_state, max_hits); + } +# endif /* __OBJECT_MOTION__ */ + return bvh_intersect_local(kg, &ray, local_isect, local_object, lcg_state, max_hits); } #endif @@ -332,82 +317,57 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, uint max_hits, uint *num_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); + PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); - if(!scene_intersect_valid(ray)) { - *num_hits = 0; - return false; - } + if (!scene_intersect_valid(ray)) { + *num_hits = 0; + return false; + } # ifdef __EMBREE__ - if(kernel_data.bvh.scene) { - CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); - ctx.isect_s = isect; - ctx.max_hits = max_hits; - ctx.num_hits = 0; - IntersectContext rtc_ctx(&ctx); - RTCRay rtc_ray; - kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW); - rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); - - if(ctx.num_hits > max_hits) { - return true; - } - *num_hits = ctx.num_hits; - return rtc_ray.tfar == -INFINITY; - } + if (kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); + ctx.isect_s = isect; + ctx.max_hits = max_hits; + ctx.num_hits = 0; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW); + rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + + if (ctx.num_hits > max_hits) { + return true; + } + *num_hits = ctx.num_hits; + return rtc_ray.tfar == -INFINITY; + } # endif # ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { + if (kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair_motion(kg, - ray, - isect, - visibility, - max_hits, - num_hits); - } -# endif /* __HAIR__ */ - - return bvh_intersect_shadow_all_motion(kg, - ray, - isect, - visibility, - max_hits, - num_hits); - } -# endif /* __OBJECT_MOTION__ */ + if (kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, visibility, max_hits, num_hits); + } +# endif /* __HAIR__ */ + + return bvh_intersect_shadow_all_motion(kg, ray, isect, visibility, max_hits, num_hits); + } +# endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair(kg, - ray, - isect, - visibility, - max_hits, - num_hits); - } -# endif /* __HAIR__ */ + if (kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair(kg, ray, isect, visibility, max_hits, num_hits); + } +# endif /* __HAIR__ */ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) { - return bvh_intersect_shadow_all_instancing(kg, - ray, - isect, - visibility, - max_hits, - num_hits); - } -# endif /* __INSTANCING__ */ - - return bvh_intersect_shadow_all(kg, - ray, - isect, - visibility, - max_hits, - num_hits); + if (kernel_data.bvh.have_instancing) { + return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits); + } +# endif /* __INSTANCING__ */ + + return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits); } -#endif /* __SHADOW_RECORD_ALL__ */ +#endif /* __SHADOW_RECORD_ALL__ */ #ifdef __VOLUME__ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, @@ -415,31 +375,31 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, Intersection *isect, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); - if(!scene_intersect_valid(ray)) { - return false; - } + if (!scene_intersect_valid(ray)) { + return false; + } # ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { - return bvh_intersect_volume_motion(kg, ray, isect, visibility); - } -# endif /* __OBJECT_MOTION__ */ + if (kernel_data.bvh.have_motion) { + return bvh_intersect_volume_motion(kg, ray, isect, visibility); + } +# endif /* __OBJECT_MOTION__ */ # ifdef __KERNEL_CPU__ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_volume_instancing(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ - return bvh_intersect_volume(kg, ray, isect, visibility); -# else /* __KERNEL_CPU__ */ + if (kernel_data.bvh.have_instancing) + return bvh_intersect_volume_instancing(kg, ray, isect, visibility); +# endif /* __INSTANCING__ */ + return bvh_intersect_volume(kg, ray, isect, visibility); +# else /* __KERNEL_CPU__ */ # ifdef __INSTANCING__ - return bvh_intersect_volume_instancing(kg, ray, isect, visibility); + return bvh_intersect_volume_instancing(kg, ray, isect, visibility); # else - return bvh_intersect_volume(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ -# endif /* __KERNEL_CPU__ */ + return bvh_intersect_volume(kg, ray, isect, visibility); +# endif /* __INSTANCING__ */ +# endif /* __KERNEL_CPU__ */ } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ #ifdef __VOLUME_RECORD_ALL__ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, @@ -448,37 +408,36 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, const uint max_hits, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); - if(!scene_intersect_valid(ray)) { - return false; - } + if (!scene_intersect_valid(ray)) { + return false; + } # ifdef __EMBREE__ - if(kernel_data.bvh.scene) { - CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); - ctx.isect_s = isect; - ctx.max_hits = max_hits; - ctx.num_hits = 0; - IntersectContext rtc_ctx(&ctx); - RTCRay rtc_ray; - kernel_embree_setup_ray(*ray, rtc_ray, visibility); - rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); - return rtc_ray.tfar == -INFINITY; - } + if (kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); + ctx.isect_s = isect; + ctx.max_hits = max_hits; + ctx.num_hits = 0; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, visibility); + rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + return rtc_ray.tfar == -INFINITY; + } # endif # ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { - return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility); - } -# endif /* __OBJECT_MOTION__ */ + if (kernel_data.bvh.have_motion) { + return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility); + } +# endif /* __OBJECT_MOTION__ */ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility); -# endif /* __INSTANCING__ */ - return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); + if (kernel_data.bvh.have_instancing) + return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility); +# endif /* __INSTANCING__ */ + return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); } -#endif /* __VOLUME_RECORD_ALL__ */ - +#endif /* __VOLUME_RECORD_ALL__ */ /* Ray offset to avoid self intersection. * @@ -488,48 +447,48 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, ccl_device_inline float3 ray_offset(float3 P, float3 Ng) { #ifdef __INTERSECTION_REFINE__ - const float epsilon_f = 1e-5f; - /* ideally this should match epsilon_f, but instancing and motion blur - * precision makes it problematic */ - const float epsilon_test = 1.0f; - const int epsilon_i = 32; - - float3 res; - - /* x component */ - if(fabsf(P.x) < epsilon_test) { - res.x = P.x + Ng.x*epsilon_f; - } - else { - uint ix = __float_as_uint(P.x); - ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i; - res.x = __uint_as_float(ix); - } - - /* y component */ - if(fabsf(P.y) < epsilon_test) { - res.y = P.y + Ng.y*epsilon_f; - } - else { - uint iy = __float_as_uint(P.y); - iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i; - res.y = __uint_as_float(iy); - } - - /* z component */ - if(fabsf(P.z) < epsilon_test) { - res.z = P.z + Ng.z*epsilon_f; - } - else { - uint iz = __float_as_uint(P.z); - iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i; - res.z = __uint_as_float(iz); - } - - return res; + const float epsilon_f = 1e-5f; + /* ideally this should match epsilon_f, but instancing and motion blur + * precision makes it problematic */ + const float epsilon_test = 1.0f; + const int epsilon_i = 32; + + float3 res; + + /* x component */ + if (fabsf(P.x) < epsilon_test) { + res.x = P.x + Ng.x * epsilon_f; + } + else { + uint ix = __float_as_uint(P.x); + ix += ((ix ^ __float_as_uint(Ng.x)) >> 31) ? -epsilon_i : epsilon_i; + res.x = __uint_as_float(ix); + } + + /* y component */ + if (fabsf(P.y) < epsilon_test) { + res.y = P.y + Ng.y * epsilon_f; + } + else { + uint iy = __float_as_uint(P.y); + iy += ((iy ^ __float_as_uint(Ng.y)) >> 31) ? -epsilon_i : epsilon_i; + res.y = __uint_as_float(iy); + } + + /* z component */ + if (fabsf(P.z) < epsilon_test) { + res.z = P.z + Ng.z * epsilon_f; + } + else { + uint iz = __float_as_uint(P.z); + iz += ((iz ^ __float_as_uint(Ng.z)) >> 31) ? -epsilon_i : epsilon_i; + res.z = __uint_as_float(iz); + } + + return res; #else - const float epsilon_f = 1e-4f; - return P + epsilon_f*Ng; + const float epsilon_f = 1e-4f; + return P + epsilon_f * Ng; #endif } @@ -537,40 +496,40 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) { - const Intersection *isect_a = (const Intersection*)a; - const Intersection *isect_b = (const Intersection*)b; - - if(isect_a->t < isect_b->t) - return -1; - else if(isect_a->t > isect_b->t) - return 1; - else - return 0; + const Intersection *isect_a = (const Intersection *)a; + const Intersection *isect_b = (const Intersection *)b; + + if (isect_a->t < isect_b->t) + return -1; + else if (isect_a->t > isect_b->t) + return 1; + else + return 0; } #endif #if defined(__SHADOW_RECORD_ALL__) ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits) { -#ifdef __KERNEL_GPU__ - /* Use bubble sort which has more friendly memory pattern on GPU. */ - bool swapped; - do { - swapped = false; - for(int j = 0; j < num_hits - 1; ++j) { - if(hits[j].t > hits[j + 1].t) { - struct Intersection tmp = hits[j]; - hits[j] = hits[j + 1]; - hits[j + 1] = tmp; - swapped = true; - } - } - --num_hits; - } while(swapped); -#else - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); -#endif +# ifdef __KERNEL_GPU__ + /* Use bubble sort which has more friendly memory pattern on GPU. */ + bool swapped; + do { + swapped = false; + for (int j = 0; j < num_hits - 1; ++j) { + if (hits[j].t > hits[j + 1].t) { + struct Intersection tmp = hits[j]; + hits[j] = hits[j + 1]; + hits[j + 1] = tmp; + swapped = true; + } + } + --num_hits; + } while (swapped); +# else + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); +# endif } -#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ +#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h index bfc911a1e76..661bba54fd4 100644 --- a/intern/cycles/kernel/bvh/bvh_embree.h +++ b/intern/cycles/kernel/bvh/bvh_embree.h @@ -24,103 +24,120 @@ CCL_NAMESPACE_BEGIN -struct CCLIntersectContext { - typedef enum { - RAY_REGULAR = 0, - RAY_SHADOW_ALL = 1, - RAY_SSS = 2, - RAY_VOLUME_ALL = 3, +struct CCLIntersectContext { + typedef enum { + RAY_REGULAR = 0, + RAY_SHADOW_ALL = 1, + RAY_SSS = 2, + RAY_VOLUME_ALL = 3, - } RayType; + } RayType; - KernelGlobals *kg; - RayType type; + KernelGlobals *kg; + RayType type; - /* for shadow rays */ - Intersection *isect_s; - int max_hits; - int num_hits; + /* for shadow rays */ + Intersection *isect_s; + int max_hits; + int num_hits; - /* for SSS Rays: */ - LocalIntersection *ss_isect; - int sss_object_id; - uint *lcg_state; + /* for SSS Rays: */ + LocalIntersection *ss_isect; + int sss_object_id; + uint *lcg_state; - CCLIntersectContext(KernelGlobals *kg_, RayType type_) - { - kg = kg_; - type = type_; - max_hits = 1; - num_hits = 0; - isect_s = NULL; - ss_isect = NULL; - sss_object_id = -1; - lcg_state = NULL; - } + CCLIntersectContext(KernelGlobals *kg_, RayType type_) + { + kg = kg_; + type = type_; + max_hits = 1; + num_hits = 0; + isect_s = NULL; + ss_isect = NULL; + sss_object_id = -1; + lcg_state = NULL; + } }; -class IntersectContext -{ -public: - IntersectContext(CCLIntersectContext* ctx) - { - rtcInitIntersectContext(&context); - userRayExt = ctx; - } - RTCIntersectContext context; - CCLIntersectContext* userRayExt; +class IntersectContext { + public: + IntersectContext(CCLIntersectContext *ctx) + { + rtcInitIntersectContext(&context); + userRayExt = ctx; + } + RTCIntersectContext context; + CCLIntersectContext *userRayExt; }; -ccl_device_inline void kernel_embree_setup_ray(const Ray& ray, RTCRay& rtc_ray, const uint visibility) +ccl_device_inline void kernel_embree_setup_ray(const Ray &ray, + RTCRay &rtc_ray, + const uint visibility) { - rtc_ray.org_x = ray.P.x; - rtc_ray.org_y = ray.P.y; - rtc_ray.org_z = ray.P.z; - rtc_ray.dir_x = ray.D.x; - rtc_ray.dir_y = ray.D.y; - rtc_ray.dir_z = ray.D.z; - rtc_ray.tnear = 0.0f; - rtc_ray.tfar = ray.t; - rtc_ray.time = ray.time; - rtc_ray.mask = visibility; + rtc_ray.org_x = ray.P.x; + rtc_ray.org_y = ray.P.y; + rtc_ray.org_z = ray.P.z; + rtc_ray.dir_x = ray.D.x; + rtc_ray.dir_y = ray.D.y; + rtc_ray.dir_z = ray.D.z; + rtc_ray.tnear = 0.0f; + rtc_ray.tfar = ray.t; + rtc_ray.time = ray.time; + rtc_ray.mask = visibility; } -ccl_device_inline void kernel_embree_setup_rayhit(const Ray& ray, RTCRayHit& rayhit, const uint visibility) +ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray, + RTCRayHit &rayhit, + const uint visibility) { - kernel_embree_setup_ray(ray, rayhit.ray, visibility); - rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID; - rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID; + kernel_embree_setup_ray(ray, rayhit.ray, visibility); + rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID; } -ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect) +ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, + const RTCRay *ray, + const RTCHit *hit, + Intersection *isect) { - bool is_hair = hit->geomID & 1; - isect->u = is_hair ? hit->u : 1.0f - hit->v - hit->u; - isect->v = is_hair ? hit->v : hit->u; - isect->t = ray->tfar; - isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); - if(hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { - RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0])); - isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, hit->instID[0]/2); - isect->object = hit->instID[0]/2; - } - else { - isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->geomID)); - isect->object = OBJECT_NONE; - } - isect->type = kernel_tex_fetch(__prim_type, isect->prim); + bool is_hair = hit->geomID & 1; + isect->u = is_hair ? hit->u : 1.0f - hit->v - hit->u; + isect->v = is_hair ? hit->v : hit->u; + isect->t = ray->tfar; + isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); + if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0])); + isect->prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + + kernel_tex_fetch(__object_node, hit->instID[0] / 2); + isect->object = hit->instID[0] / 2; + } + else { + isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.bvh.scene, hit->geomID)); + isect->object = OBJECT_NONE; + } + isect->type = kernel_tex_fetch(__prim_type, isect->prim); } -ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int local_object_id) +ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg, + const RTCRay *ray, + const RTCHit *hit, + Intersection *isect, + int local_object_id) { - isect->u = 1.0f - hit->v - hit->u; - isect->v = hit->u; - isect->t = ray->tfar; - isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); - RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, local_object_id * 2)); - isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, local_object_id); - isect->object = local_object_id; - isect->type = kernel_tex_fetch(__prim_type, isect->prim); + isect->u = 1.0f - hit->v - hit->u; + isect->v = hit->u; + isect->t = ray->tfar; + isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.bvh.scene, local_object_id * 2)); + isect->prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + + kernel_tex_fetch(__object_node, local_object_id); + isect->object = local_object_id; + isect->type = kernel_tex_fetch(__prim_type, isect->prim); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 3bdc9293a6c..7a069ef1108 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -43,208 +43,201 @@ ccl_device #else ccl_device_inline #endif -bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - LocalIntersection *local_isect, - int local_object, - uint *lcg_state, - int max_hits) + bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + LocalIntersection *local_isect, + int local_object, + uint *lcg_state, + int max_hits) { - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - test restrict attribute for pointers - */ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ - /* traversal stack in CUDA thread-local memory */ - int traversal_stack[BVH_STACK_SIZE]; - traversal_stack[0] = ENTRYPOINT_SENTINEL; + /* traversal stack in CUDA thread-local memory */ + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; - /* traversal variables in registers */ - int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, local_object); + /* traversal variables in registers */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); - /* ray parameters in registers */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = ray->t; + /* ray parameters in registers */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; - if(local_isect != NULL) { - local_isect->num_hits = 0; - } - kernel_assert((local_isect == NULL) == (max_hits == 0)); + if (local_isect != NULL) { + local_isect->num_hits = 0; + } + kernel_assert((local_isect == NULL) == (max_hits == 0)); - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, - local_object, - ray, - &P, - &dir, - &idir, - isect_t, - &ob_itfm); + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); #else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); #endif - object = local_object; - } + object = local_object; + } #if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; # if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); # endif - shuffle_swap_t shufflexyz[3]; + shuffle_swap_t shufflexyz[3]; - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - /* traversal loop */ - do { - do { - /* traverse internal nodes */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - int node_addr_child1, traverse_mask; - float dist[2]; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #if !defined(__KERNEL_SSE2__) - traverse_mask = NODE_INTERSECT(kg, - P, + traverse_mask = NODE_INTERSECT(kg, + P, # if BVH_FEATURE(BVH_HAIR) - dir, + dir, # endif - idir, - isect_t, - node_addr, - PATH_RAY_ALL_VISIBILITY, - dist); + idir, + isect_t, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, + traverse_mask = NODE_INTERSECT(kg, + P, + dir, # if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, + tnear, + tfar, # endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - PATH_RAY_ALL_VISIBILITY, - dist); + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #endif // __KERNEL_SSE2__ - node_addr = __float_as_int(cnodes.z); - node_addr_child1 = __float_as_int(cnodes.w); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(traverse_mask == 3) { - /* Both children were intersected, push the farther one. */ - bool is_closest_child1 = (dist[1] < dist[0]); - if(is_closest_child1) { - int tmp = node_addr; - node_addr = node_addr_child1; - node_addr_child1 = tmp; - } + if (traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if (is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; + } - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = node_addr_child1; - } - else { - /* One child was intersected. */ - if(traverse_mask == 2) { - node_addr = node_addr_child1; - } - else if(traverse_mask == 0) { - /* Neither child was intersected. */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; + } + else { + /* One child was intersected. */ + if (traverse_mask == 2) { + node_addr = node_addr_child1; + } + else if (traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } - /* if node is leaf, fetch triangle list */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + /* if node is leaf, fetch triangle list */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); - const int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); + const int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); - /* pop */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; + /* pop */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; - /* primitive intersection */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect_local(kg, - local_isect, - P, - dir, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } + /* primitive intersection */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect_local(kg, - local_isect, - P, - dir, - ray->time, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #endif - default: { - break; - } - } - } - } while(node_addr != ENTRYPOINT_SENTINEL); - } while(node_addr != ENTRYPOINT_SENTINEL); + default: { + break; + } + } + } + } while (node_addr != ENTRYPOINT_SENTINEL); + } while (node_addr != ENTRYPOINT_SENTINEL); - return false; + return false; } ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, @@ -254,35 +247,20 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, uint *lcg_state, int max_hits) { - switch(kernel_data.bvh.bvh_layout) { + switch (kernel_data.bvh.bvh_layout) { #ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, - ray, - local_isect, - local_object, - lcg_state, - max_hits); + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); #endif #ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - local_isect, - local_object, - lcg_state, - max_hits); + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); #endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - local_isect, - local_object, - lcg_state, - max_hits); - } - kernel_assert(!"Should not happen"); - return false; + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); + } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 060b3934a41..042630121c8 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -20,12 +20,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k int node_addr, int child) { - Transform space; - const int child_addr = node_addr + child * 3; - space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); - space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); - space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); - return space; + Transform space; + const int child_addr = node_addr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr + 1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr + 2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr + 3); + return space; } #if !defined(__KERNEL_SSE2__) @@ -38,42 +38,41 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float dist[2]) { - /* fetch node data */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); - float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); - float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); - float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); - float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); - - dist[0] = c0min; - dist[1] = c1min; - -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); -#else - return ((c0max >= c0min)? 1: 0) | - ((c1max >= c1min)? 2: 0); -#endif + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); + + dist[0] = c0min; + dist[1] = c1min; + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); +# else + return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0); +# endif } ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, @@ -87,118 +86,115 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float dist[2]) { - /* fetch node data */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); - float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); - float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); - float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); - float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); - - if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } - - dist[0] = c0min; - dist[1] = c1min; - -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); -#else - return ((c0max >= c0min)? 1: 0) | - ((c1max >= c1min)? 2: 0); -#endif + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); + + if (difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if (__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if (__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); +# else + return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0); +# endif } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child( - KernelGlobals *kg, - const float3 P, - const float3 dir, - const float t, - int node_addr, - int child, - float dist[2]) +ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int node_addr, + int child, + float dist[2]) { - Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); - float3 aligned_dir = transform_direction(&space, dir); - float3 aligned_P = transform_point(&space, P); - float3 nrdir = -bvh_inverse_direction(aligned_dir); - float3 lower_xyz = aligned_P * nrdir; - float3 upper_xyz = lower_xyz - nrdir; - const float near_x = min(lower_xyz.x, upper_xyz.x); - const float near_y = min(lower_xyz.y, upper_xyz.y); - const float near_z = min(lower_xyz.z, upper_xyz.z); - const float far_x = max(lower_xyz.x, upper_xyz.x); - const float far_y = max(lower_xyz.y, upper_xyz.y); - const float far_z = max(lower_xyz.z, upper_xyz.z); - const float tnear = max4(0.0f, near_x, near_y, near_z); - const float tfar = min4(t, far_x, far_y, far_z); - *dist = tnear; - return tnear <= tfar; + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 lower_xyz = aligned_P * nrdir; + float3 upper_xyz = lower_xyz - nrdir; + const float near_x = min(lower_xyz.x, upper_xyz.x); + const float near_y = min(lower_xyz.y, upper_xyz.y); + const float near_z = min(lower_xyz.z, upper_xyz.z); + const float far_x = max(lower_xyz.x, upper_xyz.x); + const float far_y = max(lower_xyz.y, upper_xyz.y); + const float far_z = max(lower_xyz.z, upper_xyz.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + return tnear <= tfar; } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( - KernelGlobals *kg, - const float3 P, - const float3 dir, - const float t, - const float difl, - int node_addr, - int child, - float dist[2]) +ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + int node_addr, + int child, + float dist[2]) { - Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); - float3 aligned_dir = transform_direction(&space, dir); - float3 aligned_P = transform_point(&space, P); - float3 nrdir = -bvh_inverse_direction(aligned_dir); - float3 tLowerXYZ = aligned_P * nrdir; - float3 tUpperXYZ = tLowerXYZ - nrdir; - const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); - const float near_y = min(tLowerXYZ.y, tUpperXYZ.y); - const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); - const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); - const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); - const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); - const float tnear = max4(0.0f, near_x, near_y, near_z); - const float tfar = min4(t, far_x, far_y, far_z); - *dist = tnear; - if(difl != 0.0f) { - /* TODO(sergey): Same as for QBVH, needs a proper use. */ - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - return round_down*tnear <= round_up*tfar; - } - else { - return tnear <= tfar; - } + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); + const float near_y = min(tLowerXYZ.y, tUpperXYZ.y); + const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); + const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); + const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); + const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + if (difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down * tnear <= round_up * tfar; + } + else { + return tnear <= tfar; + } } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, @@ -210,25 +206,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - int mask = 0; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.x) & visibility)) -#endif - { - mask |= 1; - } - } - if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.y) & visibility)) -#endif - { - mask |= 2; - } - } - return mask; + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.x) & visibility)) +# endif + { + mask |= 1; + } + } + if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.y) & visibility)) +# endif + { + mask |= 2; + } + } + return mask; } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, @@ -242,25 +238,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg const uint visibility, float dist[2]) { - int mask = 0; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.x) & visibility)) -#endif - { - mask |= 1; - } - } - if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.y) & visibility)) -#endif - { - mask |= 2; - } - } - return mask; + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.x) & visibility)) +# endif + { + mask |= 1; + } + } + if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.y) & visibility)) +# endif + { + mask |= 2; + } + } + return mask; } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, @@ -272,26 +268,13 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect(kg, - P, - dir, - idir, - t, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect(kg, - P, - idir, - t, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist); + } } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, @@ -305,279 +288,244 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect_robust(kg, - P, - dir, - idir, - t, - difl, - extmax, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect_robust(kg, - P, - idir, - t, - difl, - extmax, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust( + kg, P, dir, idir, t, difl, extmax, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect_robust( + kg, P, idir, t, difl, extmax, node_addr, visibility, dist); + } } -#else /* !defined(__KERNEL_SSE2__) */ - -int ccl_device_forceinline bvh_aligned_node_intersect( - KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 &P, + const float3 &dir, + const ssef &tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) { - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr; + /* fetch node data */ + const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr; - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - dist[0] = tminmax[0]; - dist[1] = tminmax[1]; + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; - int mask = movemask(lrhit); + int mask = movemask(lrhit); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } -ccl_device_forceinline int bvh_aligned_node_intersect_robust( - KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const float difl, - const float extmax, - const int nodeAddr, - const uint visibility, - float dist[2]) +ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 &P, + const float3 &dir, + const ssef &tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) { - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - if(difl != 0.0f) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 *tminmaxview = (float4*)&tminmax; - float& c0min = tminmaxview->x, &c1min = tminmaxview->y; - float& c0max = tminmaxview->z, &c1max = tminmaxview->w; - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - dist[0] = tminmax[0]; - dist[1] = tminmax[1]; - - int mask = movemask(lrhit); + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if (difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0); + float4 *tminmaxview = (float4 *)&tminmax; + float &c0min = tminmaxview->x, &c1min = tminmaxview->y; + float &c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if (__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if (__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 dir, - const ssef& isect_near, - const ssef& isect_far, + const ssef &isect_near, + const ssef &isect_far, const int node_addr, const uint visibility, float dist[2]) { - Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); - Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); - - float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir); - float3 aligned_P0 = transform_point(&space0, P), - aligned_P1 = transform_point(&space1, P); - float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), - nrdir1 = -bvh_inverse_direction(aligned_dir1); - - ssef lower_x = ssef(aligned_P0.x * nrdir0.x, - aligned_P1.x * nrdir1.x, - 0.0f, 0.0f), - lower_y = ssef(aligned_P0.y * nrdir0.y, - aligned_P1.y * nrdir1.y, - 0.0f, - 0.0f), - lower_z = ssef(aligned_P0.z * nrdir0.z, - aligned_P1.z * nrdir1.z, - 0.0f, - 0.0f); - - ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), - upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), - upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); - - ssef tnear_x = min(lower_x, upper_x); - ssef tnear_y = min(lower_y, upper_y); - ssef tnear_z = min(lower_z, upper_z); - ssef tfar_x = max(lower_x, upper_x); - ssef tfar_y = max(lower_y, upper_y); - ssef tfar_z = max(lower_z, upper_z); - - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - sseb vmask = tnear <= tfar; - dist[0] = tnear.f[0]; - dist[1] = tnear.f[1]; - - int mask = (int)movemask(vmask); + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir); + float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + sseb vmask = tnear <= tfar; + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 dir, - const ssef& isect_near, - const ssef& isect_far, + const ssef &isect_near, + const ssef &isect_far, const float difl, const int node_addr, const uint visibility, float dist[2]) { - Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); - Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); - - float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir); - float3 aligned_P0 = transform_point(&space0, P), - aligned_P1 = transform_point(&space1, P); - float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), - nrdir1 = -bvh_inverse_direction(aligned_dir1); - - ssef lower_x = ssef(aligned_P0.x * nrdir0.x, - aligned_P1.x * nrdir1.x, - 0.0f, 0.0f), - lower_y = ssef(aligned_P0.y * nrdir0.y, - aligned_P1.y * nrdir1.y, - 0.0f, - 0.0f), - lower_z = ssef(aligned_P0.z * nrdir0.z, - aligned_P1.z * nrdir1.z, - 0.0f, - 0.0f); - - ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), - upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), - upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); - - ssef tnear_x = min(lower_x, upper_x); - ssef tnear_y = min(lower_y, upper_y); - ssef tnear_z = min(lower_z, upper_z); - ssef tfar_x = max(lower_x, upper_x); - ssef tfar_y = max(lower_y, upper_y); - ssef tfar_z = max(lower_z, upper_z); - - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - sseb vmask; - if(difl != 0.0f) { - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - vmask = round_down*tnear <= round_up*tfar; - } - else { - vmask = tnear <= tfar; - } - - dist[0] = tnear.f[0]; - dist[1] = tnear.f[1]; - - int mask = (int)movemask(vmask); + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir); + float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + sseb vmask; + if (difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down * tnear <= round_up * tfar; + } + else { + vmask = tnear <= tfar; + } + + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, + const float3 &P, + const float3 &dir, + const ssef &isect_near, + const ssef &isect_far, + const ssef &tsplat, const ssef Psplat[3], const ssef idirsplat[3], const shuffle_swap_t shufflexyz[3], @@ -585,37 +533,23 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect(kg, - P, - dir, - isect_near, - isect_far, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect(kg, - P, - dir, - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect( + kg, P, dir, isect_near, isect_far, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect( + kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist); + } } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, + const float3 &P, + const float3 &dir, + const ssef &isect_near, + const ssef &isect_far, + const ssef &tsplat, const ssef Psplat[3], const ssef idirsplat[3], const shuffle_swap_t shufflexyz[3], @@ -625,31 +559,24 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect_robust(kg, - P, - dir, - isect_near, - isect_far, - difl, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect_robust(kg, - P, - dir, - tsplat, - Psplat, - idirsplat, - shufflexyz, - difl, - extmax, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust( + kg, P, dir, isect_near, isect_far, difl, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } } -#endif /* !defined(__KERNEL_SSE2__) */ +#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index d8e089711ee..b362779549c 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -19,9 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_shadow_all.h" -#ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_shadow_all.h" -#endif +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_shadow_all.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -44,350 +44,340 @@ ccl_device #else ccl_device_inline #endif -bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint visibility, - const uint max_hits, - uint *num_hits) + bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint visibility, + const uint max_hits, + uint *num_hits) { - /* todo: - * - likely and unlikely for if() statements - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversal_stack[BVH_STACK_SIZE]; - traversal_stack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* ray parameters in registers */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* todo: + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* ray parameters in registers */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - *num_hits = 0; - isect_array->t = tmax; + *num_hits = 0; + isect_array->t = tmax; #if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; # if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); # endif - shuffle_swap_t shufflexyz[3]; + shuffle_swap_t shufflexyz[3]; - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif /* __KERNEL_SSE2__ */ + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif /* __KERNEL_SSE2__ */ - /* traversal loop */ - do { - do { - /* traverse internal nodes */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - int node_addr_child1, traverse_mask; - float dist[2]; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #if !defined(__KERNEL_SSE2__) - traverse_mask = NODE_INTERSECT(kg, - P, + traverse_mask = NODE_INTERSECT(kg, + P, # if BVH_FEATURE(BVH_HAIR) - dir, + dir, # endif - idir, - isect_t, - node_addr, - visibility, - dist); + idir, + isect_t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, + traverse_mask = NODE_INTERSECT(kg, + P, + dir, # if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, + tnear, + tfar, # endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - node_addr = __float_as_int(cnodes.z); - node_addr_child1 = __float_as_int(cnodes.w); - - if(traverse_mask == 3) { - /* Both children were intersected, push the farther one. */ - bool is_closest_child1 = (dist[1] < dist[0]); - if(is_closest_child1) { - int tmp = node_addr; - node_addr = node_addr_child1; - node_addr_child1 = tmp; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = node_addr_child1; - } - else { - /* One child was intersected. */ - if(traverse_mask == 2) { - node_addr = node_addr_child1; - } - else if(traverse_mask == 0) { - /* Neither child was intersected. */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } - - /* if node is leaf, fetch triangle list */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); + + if (traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if (is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; + } + else { + /* One child was intersected. */ + if (traverse_mask == 2) { + node_addr = node_addr_child1; + } + else if (traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - const int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* pop */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - - /* primitive intersection */ - while(prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - bool hit; - - /* todo: specialized intersect functions which don't fill in - * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? - * might give a few % performance improvement */ - - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr); - break; - } + const int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* pop */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + + /* primitive intersection */ + while (prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + bool hit; + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, - isect_array, - P, - dir, - ray->time, - visibility, - object, - prim_addr); - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + break; + } #endif #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - break; - } + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + break; + } #endif - default: { - hit = false; - break; - } - } + default: { + hit = false; + break; + } + } - /* shadow ray early termination */ - if(hit) { - /* detect if this surface has a shader with transparent shadows */ + /* shadow ray early termination */ + if (hit) { + /* detect if this surface has a shader with transparent shadows */ - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? */ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; #ifdef __HAIR__ - if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } + { + shader = kernel_tex_fetch(__tri_shader, prim); + } #ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } #endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if(*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if (*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - } + isect_array->t = isect_t; + } - prim_addr++; - } - } + prim_addr++; + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* instance push */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + num_hits_in_instance = 0; + isect_array->t = isect_t; # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); + tfar = ssef(isect_t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - node_addr = kernel_tex_fetch(__object_node, object); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + node_addr = kernel_tex_fetch(__object_node, object); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* scale isect->t to adjust for instancing */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* scale isect->t to adjust for instancing */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); + tfar = ssef(isect_t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return false; + return false; } ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, @@ -397,35 +387,20 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, uint *num_hits) { - switch(kernel_data.bvh.bvh_layout) { + switch (kernel_data.bvh.bvh_layout) { #ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, - ray, - isect_array, - visibility, - max_hits, - num_hits); + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, visibility, max_hits, num_hits); #endif #ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect_array, - visibility, - max_hits, - num_hits); + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, visibility, max_hits, num_hits); #endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - visibility, - max_hits, - num_hits); - } - kernel_assert(!"Should not happen"); - return false; + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits); + } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 76d4cab663d..34a06d003bb 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -47,374 +47,362 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Intersection *isect, const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , uint *lcg_state, + , + uint *lcg_state, float difl, float extmax #endif - ) +) { - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversal_stack[BVH_STACK_SIZE]; - traversal_stack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* ray parameters in registers */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* ray parameters in registers */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; - BVH_DEBUG_INIT(); + BVH_DEBUG_INIT(); #if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; # if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect->t); + ssef tnear(0.0f), tfar(isect->t); # endif - shuffle_swap_t shufflexyz[3]; + shuffle_swap_t shufflexyz[3]; - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - /* traversal loop */ - do { - do { - /* traverse internal nodes */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - int node_addr_child1, traverse_mask; - float dist[2]; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #if !defined(__KERNEL_SSE2__) # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - traverse_mask = NODE_INTERSECT_ROBUST(kg, - P, + if (difl != 0.0f) { + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, # if BVH_FEATURE(BVH_HAIR) - dir, + dir, # endif - idir, - isect->t, - difl, - extmax, - node_addr, - visibility, - dist); - } - else + idir, + isect->t, + difl, + extmax, + node_addr, + visibility, + dist); + } + else # endif - { - traverse_mask = NODE_INTERSECT(kg, - P, -# if BVH_FEATURE(BVH_HAIR) - dir, -# endif - idir, - isect->t, - node_addr, - visibility, - dist); - } + { + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); + } #else // __KERNEL_SSE2__ # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - traverse_mask = NODE_INTERSECT_ROBUST(kg, - P, - dir, + if (difl != 0.0f) { + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, + dir, # if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, + tnear, + tfar, # endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - difl, - extmax, - node_addr, - visibility, - dist); - } - else + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } + else # endif - { - traverse_mask = NODE_INTERSECT(kg, - P, - dir, -# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); - } + { + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } #endif // __KERNEL_SSE2__ - node_addr = __float_as_int(cnodes.z); - node_addr_child1 = __float_as_int(cnodes.w); - - if(traverse_mask == 3) { - /* Both children were intersected, push the farther one. */ - bool is_closest_child1 = (dist[1] < dist[0]); - if(is_closest_child1) { - int tmp = node_addr; - node_addr = node_addr_child1; - node_addr_child1 = tmp; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = node_addr_child1; - } - else { - /* One child was intersected. */ - if(traverse_mask == 2) { - node_addr = node_addr_child1; - } - else if(traverse_mask == 0) { - /* Neither child was intersected. */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - BVH_DEBUG_NEXT_NODE(); - } - - /* if node is leaf, fetch triangle list */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); + + if (traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if (is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; + } + else { + /* One child was intersected. */ + if (traverse_mask == 2) { + node_addr = node_addr_child1; + } + else if (traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + BVH_DEBUG_NEXT_NODE(); + } + + /* if node is leaf, fetch triangle list */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - const int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* pop */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - - /* primitive intersection */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr)) - { - /* shadow ray early termination */ + const int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* pop */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + + /* primitive intersection */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { + /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif #else - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; #endif - } - } - break; - } + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect(kg, - isect, - P, - dir, - ray->time, - visibility, - object, - prim_addr)) - { - /* shadow ray early termination */ + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { + /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif # else - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; # endif - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - else { - hit = curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - if(hit) { - /* shadow ray early termination */ + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if (hit) { + /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif # else - if(visibility & PATH_RAY_SHADOW_OPAQUE) - return true; + if (visibility & PATH_RAY_SHADOW_OPAQUE) + return true; # endif - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* instance push */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - node_addr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* instance pop */ + /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, @@ -422,53 +410,57 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, Intersection *isect, const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , uint *lcg_state, + , + uint *lcg_state, float difl, float extmax #endif - ) +) { - switch(kernel_data.bvh.bvh_layout) { + switch (kernel_data.bvh.bvh_layout) { #ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, - ray, - isect, - visibility + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax + , + lcg_state, + difl, + extmax # endif - ); + ); #endif #ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect, - visibility + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect, + visibility # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax + , + lcg_state, + difl, + extmax # endif - ); -#endif /* __QBVH__ */ - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect, - visibility + ); +#endif /* __QBVH__ */ + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect, + visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax + , + lcg_state, + difl, + extmax #endif - ); - } - kernel_assert(!"Should not happen"); - return false; + ); + } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index 4ca0dc2225e..16f3b03f842 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -35,13 +35,13 @@ CCL_NAMESPACE_BEGIN #define BVH_OSTACK_SIZE 768 /* BVH intersection function variations */ -#define BVH_INSTANCING 1 -#define BVH_MOTION 2 -#define BVH_HAIR 4 -#define BVH_HAIR_MINIMUM_WIDTH 8 +#define BVH_INSTANCING 1 +#define BVH_MOTION 2 +#define BVH_HAIR 4 +#define BVH_HAIR_MINIMUM_WIDTH 8 -#define BVH_NAME_JOIN(x,y) x ## _ ## y -#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y) +#define BVH_NAME_JOIN(x, y) x##_##y +#define BVH_NAME_EVAL(x, y) BVH_NAME_JOIN(x, y) #define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME) #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) @@ -49,30 +49,30 @@ CCL_NAMESPACE_BEGIN /* Debugging heleprs */ #ifdef __KERNEL_DEBUG__ # define BVH_DEBUG_INIT() \ - do { \ - isect->num_traversed_nodes = 0; \ - isect->num_traversed_instances = 0; \ - isect->num_intersections = 0; \ - } while(0) + do { \ + isect->num_traversed_nodes = 0; \ + isect->num_traversed_instances = 0; \ + isect->num_intersections = 0; \ + } while (0) # define BVH_DEBUG_NEXT_NODE() \ - do { \ - ++isect->num_traversed_nodes; \ - } while(0) + do { \ + ++isect->num_traversed_nodes; \ + } while (0) # define BVH_DEBUG_NEXT_INTERSECTION() \ - do { \ - ++isect->num_intersections; \ - } while(0) + do { \ + ++isect->num_intersections; \ + } while (0) # define BVH_DEBUG_NEXT_INSTANCE() \ - do { \ - ++isect->num_traversed_instances; \ - } while(0) -#else /* __KERNEL_DEBUG__ */ + do { \ + ++isect->num_traversed_instances; \ + } while (0) +#else /* __KERNEL_DEBUG__ */ # define BVH_DEBUG_INIT() # define BVH_DEBUG_NEXT_NODE() # define BVH_DEBUG_NEXT_INTERSECTION() # define BVH_DEBUG_NEXT_INSTANCE() -#endif /* __KERNEL_DEBUG__ */ +#endif /* __KERNEL_DEBUG__ */ CCL_NAMESPACE_END -#endif /* __BVH_TYPES__ */ +#endif /* __BVH_TYPES__ */ diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index b8257e3493e..c83b0d783f4 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -19,9 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume.h" -#ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_volume.h" -#endif +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -43,267 +43,260 @@ ccl_device #else ccl_device_inline #endif -bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) + bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility) { - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversal_stack[BVH_STACK_SIZE]; - traversal_stack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* ray parameters in registers */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* ray parameters in registers */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; #if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; # if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect->t); + ssef tnear(0.0f), tfar(isect->t); # endif - shuffle_swap_t shufflexyz[3]; + shuffle_swap_t shufflexyz[3]; - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - /* traversal loop */ - do { - do { - /* traverse internal nodes */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - int node_addr_child1, traverse_mask; - float dist[2]; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #if !defined(__KERNEL_SSE2__) - traverse_mask = NODE_INTERSECT(kg, - P, + traverse_mask = NODE_INTERSECT(kg, + P, # if BVH_FEATURE(BVH_HAIR) - dir, + dir, # endif - idir, - isect->t, - node_addr, - visibility, - dist); + idir, + isect->t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, + traverse_mask = NODE_INTERSECT(kg, + P, + dir, # if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, + tnear, + tfar, # endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - node_addr = __float_as_int(cnodes.z); - node_addr_child1 = __float_as_int(cnodes.w); - - if(traverse_mask == 3) { - /* Both children were intersected, push the farther one. */ - bool is_closest_child1 = (dist[1] < dist[0]); - if(is_closest_child1) { - int tmp = node_addr; - node_addr = node_addr_child1; - node_addr_child1 = tmp; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = node_addr_child1; - } - else { - /* One child was intersected. */ - if(traverse_mask == 2) { - node_addr = node_addr_child1; - } - else if(traverse_mask == 0) { - /* Neither child was intersected. */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } - - /* if node is leaf, fetch triangle list */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); + + if (traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if (is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; + } + else { + /* One child was intersected. */ + if (traverse_mask == 2) { + node_addr = node_addr_child1; + } + else if (traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - const int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* pop */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - - /* primitive intersection */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - triangle_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr); - } - break; - } + const int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* pop */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + + /* primitive intersection */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - motion_triangle_intersect(kg, - isect, - P, - dir, - ray->time, - visibility, - object, - prim_addr); - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } #endif - default: { - break; - } - } - } + default: { + break; + } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* instance push */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* pop */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* instance pop */ + /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); + tfar = ssef(isect->t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } -#endif /* FEATURE(BVH_MOTION) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } +#endif /* FEATURE(BVH_MOTION) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, @@ -311,29 +304,20 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, Intersection *isect, const uint visibility) { - switch(kernel_data.bvh.bvh_layout) { + switch (kernel_data.bvh.bvh_layout) { #ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, - ray, - isect, - visibility); + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility); #endif #ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect, - visibility); + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility); #endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect, - visibility); - } - kernel_assert(!"Should not happen"); - return false; + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility); + } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index f3ca4058460..ae8c4d12e8a 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -19,9 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume_all.h" -#ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_volume_all.h" -#endif +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume_all.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -43,342 +43,337 @@ ccl_device #else ccl_device_inline #endif -uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - const uint visibility) + uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + const uint visibility) { - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversal_stack[BVH_STACK_SIZE]; - traversal_stack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* ray parameters in registers */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* ray parameters in registers */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - uint num_hits = 0; - isect_array->t = tmax; + uint num_hits = 0; + isect_array->t = tmax; #if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; # if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); # endif - shuffle_swap_t shufflexyz[3]; + shuffle_swap_t shufflexyz[3]; - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif /* __KERNEL_SSE2__ */ + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif /* __KERNEL_SSE2__ */ - /* traversal loop */ - do { - do { - /* traverse internal nodes */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - int node_addr_child1, traverse_mask; - float dist[2]; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #if !defined(__KERNEL_SSE2__) - traverse_mask = NODE_INTERSECT(kg, - P, + traverse_mask = NODE_INTERSECT(kg, + P, # if BVH_FEATURE(BVH_HAIR) - dir, + dir, # endif - idir, - isect_t, - node_addr, - visibility, - dist); + idir, + isect_t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, + traverse_mask = NODE_INTERSECT(kg, + P, + dir, # if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, + tnear, + tfar, # endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - node_addr = __float_as_int(cnodes.z); - node_addr_child1 = __float_as_int(cnodes.w); - - if(traverse_mask == 3) { - /* Both children were intersected, push the farther one. */ - bool is_closest_child1 = (dist[1] < dist[0]); - if(is_closest_child1) { - int tmp = node_addr; - node_addr = node_addr_child1; - node_addr_child1 = tmp; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = node_addr_child1; - } - else { - /* One child was intersected. */ - if(traverse_mask == 2) { - node_addr = node_addr_child1; - } - else if(traverse_mask == 0) { - /* Neither child was intersected. */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } - - /* if node is leaf, fetch triangle list */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); + + if (traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if (is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; + } + else { + /* One child was intersected. */ + if (traverse_mask == 2) { + node_addr = node_addr_child1; + } + else if (traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - const int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - bool hit; - - /* pop */ - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - - /* primitive intersection */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - hit = triangle_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + const int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + bool hit; + + /* pop */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + + /* primitive intersection */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) - if(object != OBJECT_NONE) { + if (object != OBJECT_NONE) { # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform( + kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* intersect ray against primitive */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - hit = motion_triangle_intersect(kg, - isect_array, - P, - dir, - ray->time, - visibility, - object, - prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + case PRIMITIVE_MOTION_TRIANGLE: { + /* intersect ray against primitive */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; # if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; # endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) - if(object != OBJECT_NONE) { + if (object != OBJECT_NONE) { # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform( + kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } -#endif /* BVH_MOTION */ - default: { - break; - } - } - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif /* BVH_MOTION */ + default: { + break; + } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* instance push */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + num_hits_in_instance = 0; + isect_array->t = isect_t; # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); + tfar = ssef(isect_t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_STACK_SIZE); - traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* pop */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* Scale isect->t to adjust for instancing. */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* Scale isect->t to adjust for instancing. */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; # if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); # if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); + tfar = ssef(isect_t); # endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr]; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return num_hits; + return num_hits; } ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, @@ -387,32 +382,20 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, const uint visibility) { - switch(kernel_data.bvh.bvh_layout) { + switch (kernel_data.bvh.bvh_layout) { #ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, - ray, - isect_array, - max_hits, - visibility); + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, max_hits, visibility); #endif #ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect_array, - max_hits, - visibility); + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, visibility); #endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - max_hits, - visibility); - } - kernel_assert(!"Should not happen"); - return 0; + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility); + } + kernel_assert(!"Should not happen"); + return 0; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h index f449cefb335..e6bb548bc5b 100644 --- a/intern/cycles/kernel/bvh/obvh_local.h +++ b/intern/cycles/kernel/bvh/obvh_local.h @@ -34,372 +34,365 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, uint *lcg_state, int max_hits) { - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, local_object); + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = ray->t; + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; - if(local_isect != NULL) { - local_isect->num_hits = 0; - } - kernel_assert((local_isect == NULL) == (max_hits == 0)); + if (local_isect != NULL) { + local_isect->num_hits = 0; + } + kernel_assert((local_isect == NULL) == (max_hits == 0)); - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, - local_object, - ray, - &P, - &dir, - &idir, - isect_t, - &ob_itfm); + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); #else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); #endif - object = local_object; - } + object = local_object; + } - avxf tnear(0.0f), tfar(isect_t); + avxf tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); #endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); - if(child_mask != 0) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - avxf cnodes; + if (child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + avxf cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); + } + else #endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); - } + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); + } - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* Intersect ray against primitive, */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect_local(kg, - local_isect, - P, - dir, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) - { - return true; - } - } - break; - } + /* Primitive intersection. */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* Intersect ray against primitive, */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* Intersect ray against primitive. */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect_local(kg, - local_isect, - P, - dir, - ray->time, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) - { - return true; - } - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + /* Intersect ray against primitive. */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #endif - default: - break; - } - } - } while(node_addr != ENTRYPOINT_SENTINEL); - } while(node_addr != ENTRYPOINT_SENTINEL); - return false; + default: + break; + } + } + } while (node_addr != ENTRYPOINT_SENTINEL); + } while (node_addr != ENTRYPOINT_SENTINEL); + return false; } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h index 93f35f6dffb..6831562cade 100644 --- a/intern/cycles/kernel/bvh/obvh_nodes.h +++ b/intern/cycles/kernel/bvh/obvh_nodes.h @@ -17,11 +17,11 @@ */ struct OBVHStackItem { - int addr; - float dist; + int addr; + float dist; }; -ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, +ccl_device_inline void obvh_near_far_idx_calc(const float3 &idir, int *ccl_restrict near_x, int *ccl_restrict near_y, int *ccl_restrict near_z, @@ -31,41 +31,73 @@ ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, { #ifdef __KERNEL_SSE__ - *near_x = 0; *far_x = 1; - *near_y = 2; *far_y = 3; - *near_z = 4; *far_z = 5; - - const size_t mask = movemask(ssef(idir.m128)); - - const int mask_x = mask & 1; - const int mask_y = (mask & 2) >> 1; - const int mask_z = (mask & 4) >> 2; - - *near_x += mask_x; *far_x -= mask_x; - *near_y += mask_y; *far_y -= mask_y; - *near_z += mask_z; *far_z -= mask_z; + *near_x = 0; + *far_x = 1; + *near_y = 2; + *far_y = 3; + *near_z = 4; + *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; + *far_x -= mask_x; + *near_y += mask_y; + *far_y -= mask_y; + *near_z += mask_z; + *far_z -= mask_z; #else - if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } - if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } - if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } + if (idir.x >= 0.0f) { + *near_x = 0; + *far_x = 1; + } + else { + *near_x = 1; + *far_x = 0; + } + if (idir.y >= 0.0f) { + *near_y = 2; + *far_y = 3; + } + else { + *near_y = 3; + *far_y = 2; + } + if (idir.z >= 0.0f) { + *near_z = 4; + *far_z = 5; + } + else { + *near_z = 5; + *far_z = 4; + } #endif } -ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, - OBVHStackItem *ccl_restrict b) +ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, OBVHStackItem *ccl_restrict b) { - OBVHStackItem tmp = *a; - *a = *b; - *b = tmp; + OBVHStackItem tmp = *a; + *a = *b; + *b = tmp; } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s2, OBVHStackItem *ccl_restrict s3) { - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } + if (s3->dist < s2->dist) { + obvh_item_swap(s3, s2); + } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -73,11 +105,21 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s3, OBVHStackItem *ccl_restrict s4) { - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } + if (s4->dist < s3->dist) { + obvh_item_swap(s4, s3); + } + if (s3->dist < s1->dist) { + obvh_item_swap(s3, s1); + } + if (s4->dist < s2->dist) { + obvh_item_swap(s4, s2); + } + if (s3->dist < s2->dist) { + obvh_item_swap(s3, s2); + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -86,19 +128,19 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s4, OBVHStackItem *ccl_restrict s5) { - obvh_stack_sort(s1, s2, s3, s4); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -108,22 +150,22 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s5, OBVHStackItem *ccl_restrict s6) { - obvh_stack_sort(s1, s2, s3, s4, s5); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -134,25 +176,25 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s6, OBVHStackItem *ccl_restrict s7) { - obvh_stack_sort(s1, s2, s3, s4, s5, s6); - if(s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5, s6); + if (s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -164,41 +206,41 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s7, OBVHStackItem *ccl_restrict s8) { - obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); - if(s8->dist < s7->dist) { - obvh_item_swap(s7, s8); - if(s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); + if (s8->dist < s7->dist) { + obvh_item_swap(s7, s8); + if (s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } + } } /* Axis-aligned nodes intersection */ ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #else - const avx3f& org, + const avx3f &org, #endif - const avx3f& idir, + const avx3f &idir, const int near_x, const int near_y, const int near_z, @@ -208,213 +250,216 @@ ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const int node_addr, avxf *ccl_restrict dist) { - const int offset = node_addr + 2; + const int offset = node_addr + 2; #ifdef __KERNEL_AVX2__ - const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x); - const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y); - const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z); - const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x); - const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y); - const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z); - - const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); - const avxb vmask = tnear <= tfar; - int mask = (int)movemask(vmask); - *dist = tnear; - return mask; + const avxf tnear_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, org_idir.x); + const avxf tnear_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, org_idir.y); + const avxf tnear_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, org_idir.z); + const avxf tfar_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, org_idir.x); + const avxf tfar_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, org_idir.y); + const avxf tfar_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, org_idir.z); + + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; #else - return 0; + return 0; #endif } -ccl_device_inline int obvh_aligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_aligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #else - const avx3f& P, + const avx3f &P, #endif - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr + 2; + const int offset = node_addr + 2; #ifdef __KERNEL_AVX2__ - const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); - const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); - const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); - const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); - const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); - const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); - const avxb vmask = round_down*tnear <= round_up*tfar; - int mask = (int)movemask(vmask); - *dist = tnear; - return mask; + const avxf tnear_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); + const avxf tfar_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); + const avxf tnear_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); + const avxf tfar_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); + const avxf tnear_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); + const avxf tfar_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = round_down * tnear <= round_up * tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; #else - return 0; + return 0; #endif } /* Unaligned nodes intersection */ -ccl_device_inline int obvh_unaligned_node_intersect( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #endif - const avx3f& org, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) + const avx3f &org, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); - const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); - const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); - - const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); - const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); - const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); - - const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); - const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); - const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); - - const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); - const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); - const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); - - const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; - - const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, - aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, - aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; - - const avxf neg_one(-1.0f); - const avxf nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; - - const avxf tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const avxf tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - - const avxf tnear_x = min(tlower_x, tupper_x); - const avxf tnear_y = min(tlower_y, tupper_y); - const avxf tnear_z = min(tlower_z, tupper_z); - const avxf tfar_x = max(tlower_x, tupper_x); - const avxf tfar_y = max(tlower_y, tupper_y); - const avxf tfar_z = max(tlower_z, tupper_z); - const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const avxb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24); + + const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; + + const avxf aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x, + aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y, + aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); } -ccl_device_inline int obvh_unaligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_unaligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #endif - const avx3f& P, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &P, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); - const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); - const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); - - const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); - const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); - const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); - - const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); - const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); - const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); - - const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); - const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); - const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); - - const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; - - const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, - aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, - aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; - - const avxf neg_one(-1.0f); - const avxf nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; - - const avxf tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const avxf tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - - const avxf tnear_x = min(tlower_x, tupper_x); - const avxf tnear_y = min(tlower_y, tupper_y); - const avxf tnear_z = min(tlower_z, tupper_z); - const avxf tfar_x = max(tlower_x, tupper_x); - const avxf tfar_y = max(tlower_y, tupper_y); - const avxf tfar_z = max(tlower_z, tupper_z); - - const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const avxb vmask = round_down*tnear <= round_up*tfar; - *dist = tnear; - return movemask(vmask); + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24); + + const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; + + const avxf aligned_P_x = P.x * tfm_x_x + P.y * tfm_x_y + P.z * tfm_x_z + tfm_t_x, + aligned_P_y = P.x * tfm_y_x + P.y * tfm_y_y + P.z * tfm_y_z + tfm_t_y, + aligned_P_z = P.x * tfm_z_x + P.y * tfm_z_y + P.z * tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = round_down * tnear <= round_up * tfar; + *dist = tnear; + return movemask(vmask); } /* Intersectors wrappers. @@ -422,111 +467,125 @@ ccl_device_inline int obvh_unaligned_node_intersect_robust( * They'll check node type and call appropriate intersection code. */ -ccl_device_inline int obvh_node_intersect( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #endif - const avx3f& org, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) + const avx3f &org, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return obvh_unaligned_node_intersect(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #endif - org, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } - else { - return obvh_aligned_node_intersect(kg, - isect_near, - isect_far, + org, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } + else { + return obvh_aligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #else - org, + org, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } } -ccl_device_inline int obvh_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #endif - const avx3f& P, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &P, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return obvh_unaligned_node_intersect_robust(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #endif - P, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } - else { - return obvh_aligned_node_intersect_robust(kg, - isect_near, - isect_far, + P, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } + else { + return obvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #else - P, + P, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } } diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h index 10d5422c31c..98efb003788 100644 --- a/intern/cycles/kernel/bvh/obvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h @@ -36,645 +36,635 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, const uint max_hits, uint *num_hits) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - *num_hits = 0; - isect_array->t = tmax; + *num_hits = 0; + isect_array->t = tmax; #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - avxf tnear(0.0f), tfar(isect_t); + avxf tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); #endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void) inodes; - - if(false + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + (void)inodes; + + if (false #ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) + || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) #endif #if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) - || UNLIKELY(ray->time > inodes.z) + || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) #endif - ) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) -//#if !defined(__KERNEL_AVX2__) - org4, + //#if !defined(__KERNEL_AVX2__) + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - avxf cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + avxf cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); + } + else #endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - int prim_addr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - if(p_type == PRIMITIVE_TRIANGLE) { - int prim_count = prim_addr2 - prim_addr; - if(prim_count < 3) { - while(prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - int hit = triangle_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr); - /* Shadow ray early termination. */ - if(hit) { - /* detect if this surface has a shader with transparent shadows */ - - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? */ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + if (p_type == PRIMITIVE_TRIANGLE) { + int prim_count = prim_addr2 - prim_addr; + if (prim_count < 3) { + while (prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == + p_type); + int hit = triangle_intersect( + kg, isect_array, P, dir, PATH_RAY_SHADOW, object, prim_addr); + /* Shadow ray early termination. */ + if (hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; #ifdef __HAIR__ - if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } + { + shader = kernel_tex_fetch(__tri_shader, prim); + } #ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } #endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if(*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if (*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - } + isect_array->t = isect_t; + } - prim_addr++; - } //while - } else { - kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type); + prim_addr++; + } //while + } + else { + kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == + p_type); #if BVH_FEATURE(BVH_INSTANCING) - int* nhiptr = &num_hits_in_instance; + int *nhiptr = &num_hits_in_instance; #else - int nhi= 0; - int *nhiptr = &nhi; + int nhi = 0; + int *nhiptr = &nhi; #endif - int result = triangle_intersect8(kg, - &isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - prim_count, - num_hits, - max_hits, - nhiptr, - isect_t); - if(result == 2) { - return true; - } - } // prim_count - } // PRIMITIVE_TRIANGLE - else { - while(prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + int result = triangle_intersect8(kg, + &isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + prim_count, + num_hits, + max_hits, + nhiptr, + isect_t); + if (result == 2) { + return true; + } + } // prim_count + } // PRIMITIVE_TRIANGLE + else { + while (prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); #ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) - ? kernel_tex_fetch(__prim_object, prim_addr) - : object; - if(tri_object == skip_object) { - ++prim_addr; - continue; - } + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + if (tri_object == skip_object) { + ++prim_addr; + continue; + } #endif - bool hit; + bool hit; - /* todo: specialized intersect functions which don't fill in - * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? - * might give a few % performance improvement */ + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ - switch(p_type) { + switch (p_type) { #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, - isect_array, - P, - dir, - ray->time, - PATH_RAY_SHADOW, - object, - prim_addr); - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, prim_addr); + break; + } #endif #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - break; - } + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + break; + } #endif - default: { - hit = false; - break; - } - } + default: { + hit = false; + break; + } + } - /* Shadow ray early termination. */ - if(hit) { - /* detect if this surface has a shader with transparent shadows */ + /* Shadow ray early termination. */ + if (hit) { + /* detect if this surface has a shader with transparent shadows */ - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? */ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; #ifdef __HAIR__ - if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } + { + shader = kernel_tex_fetch(__tri_shader, prim); + } #ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } #endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if(*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if (*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - } + isect_array->t = isect_t; + } - prim_addr++; - }//while prim - } - } + prim_addr++; + } //while prim + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + num_hits_in_instance = 0; + isect_array->t = isect_t; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect_t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + node_addr = kernel_tex_fetch(__object_node, object); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* Scale isect->t to adjust for instancing. */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* Scale isect->t to adjust for instancing. */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect_t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return false; + return false; } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h index 5df7a3be515..86b1de48aaa 100644 --- a/intern/cycles/kernel/bvh/obvh_traversal.h +++ b/intern/cycles/kernel/bvh/obvh_traversal.h @@ -37,598 +37,583 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, Intersection *isect, const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - ,uint *lcg_state, + , + uint *lcg_state, float difl, float extmax #endif - ) +) { - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - traversal_stack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - float node_dist = -FLT_MAX; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; - BVH_DEBUG_INIT(); - avxf tnear(0.0f), tfar(ray->t); + BVH_DEBUG_INIT(); + avxf tnear(0.0f), tfar(ray->t); #if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); #endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void) inodes; - - if(UNLIKELY(node_dist > isect->t) + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + (void)inodes; + + if (UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) - || UNLIKELY(ray->time > inodes.z) + || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) #endif #ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0 + || (__float_as_uint(inodes.x) & visibility) == 0 #endif - ) - { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } - int child_mask; - avxf dist; + int child_mask; + avxf dist; - BVH_DEBUG_NEXT_NODE(); + BVH_DEBUG_NEXT_NODE(); #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - /* NOTE: We extend all the child BB instead of fetching - * and checking visibility flags for each of the, - * - * Need to test if doing opposite would be any faster. - */ - child_mask = NODE_INTERSECT_ROBUST(kg, - tnear, - tfar, + if (difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, # ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, # endif # if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, # endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - &dist); - } - else -#endif /* BVH_HAIR_MINIMUM_WIDTH */ - { - child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - } - - if(child_mask != 0) { - avxf cnodes; - /* TODO(sergey): Investigate whether moving cnodes upwards - * gives a speedup (will be different cache pattern but will - * avoid extra check here). - */ + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + } + + if (child_mask != 0) { + avxf cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here). + */ #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); + } + else #endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - float d0 = ((float*)&dist)[r]; - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - node_dist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - node_dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - node_dist = d0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float *)&dist)[r]; + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); #ifdef __VISIBILITY_FLAG__ - if(UNLIKELY((node_dist > isect->t) || - ((__float_as_uint(leaf.z) & visibility) == 0))) + if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) #else - if(UNLIKELY((node_dist > isect->t))) + if (UNLIKELY((node_dist > isect->t))) #endif - { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - int prim_addr = __float_as_int(leaf.x); + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - int prim_count = prim_addr2 - prim_addr; - if(prim_count < 3) { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr)) - { - tfar = avxf(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - }//for - } - else { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect8(kg, - &isect, - P, - dir, - visibility, - object, - prim_addr, - prim_count, - 0, - 0, - NULL, - 0.0f)) - { - tfar = avxf(isect->t); - if(visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - }//prim count - break; - } + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + int prim_count = prim_addr2 - prim_addr; + if (prim_count < 3) { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if (visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } //for + } + else { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect8(kg, + &isect, + P, + dir, + visibility, + object, + prim_addr, + prim_count, + 0, + 0, + NULL, + 0.0f)) { + tfar = avxf(isect->t); + if (visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } //prim count + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect(kg, - isect, - P, - dir, - ray->time, - visibility, - object, - prim_addr)) - { - tfar = avxf(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if (visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - else { - hit = curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - if(hit) { - tfar = avxf(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if (hit) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if (visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); + qbvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); # else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); # endif - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect->t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - traversal_stack[stack_ptr].dist = -FLT_MAX; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; - node_addr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ + /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect->t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h index e66d499dccc..fb41ae783ab 100644 --- a/intern/cycles/kernel/bvh/obvh_volume.h +++ b/intern/cycles/kernel/bvh/obvh_volume.h @@ -33,444 +33,448 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, Intersection *isect, const uint visibility) { - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; - avxf tnear(0.0f), tfar(ray->t); + avxf tnear(0.0f), tfar(ray->t); #if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); #endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - avxf cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + avxf cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); + } + else #endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - - if((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + + if ((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); - } - break; - } + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } #endif - } - } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect->t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ + /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect->t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h index 5476f79712a..56e2afd4a11 100644 --- a/intern/cycles/kernel/bvh/obvh_volume_all.h +++ b/intern/cycles/kernel/bvh/obvh_volume_all.h @@ -34,514 +34,518 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, const uint max_hits, const uint visibility) { - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - uint num_hits = 0; - isect_array->t = tmax; + uint num_hits = 0; + isect_array->t = tmax; #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - avxf tnear(0.0f), tfar(isect_t); + avxf tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); #endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - avxf cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + avxf cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); + } + else #endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - - if((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + + if ((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - bool hit; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; # if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; # endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } #endif - } - } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect_t); - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect_t); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* Scale isect->t to adjust for instancing. */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* Scale isect->t to adjust for instancing. */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; - obvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = avxf(isect_t); + obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = avxf(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); # endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return num_hits; + return num_hits; } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h index 661182e31b3..b21f79bd3a0 100644 --- a/intern/cycles/kernel/bvh/qbvh_local.h +++ b/intern/cycles/kernel/bvh/qbvh_local.h @@ -35,262 +35,257 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, uint *lcg_state, int max_hits) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - SSE for hair. - * - Test restrict attribute for pointers. - */ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - SSE for hair. + * - Test restrict attribute for pointers. + */ - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, local_object); + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = ray->t; + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; - if(local_isect != NULL) { - local_isect->num_hits = 0; - } - kernel_assert((local_isect == NULL) == (max_hits == 0)); + if (local_isect != NULL) { + local_isect->num_hits = 0; + } + kernel_assert((local_isect == NULL) == (max_hits == 0)); - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, - local_object, - ray, - &P, - &dir, - &idir, - isect_t, - &ob_itfm); + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); #else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); #endif - object = local_object; - } + object = local_object; + } - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); #endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + ssef dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); - if(child_mask != 0) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - float4 cnodes; + if (child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + float4 cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); + } + else #endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); - } + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); + } - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - int prim_addr = __float_as_int(leaf.x); + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + int prim_addr = __float_as_int(leaf.x); - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* Intersect ray against primitive, */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect_local(kg, - local_isect, - P, - dir, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } + /* Primitive intersection. */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* Intersect ray against primitive, */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* Intersect ray against primitive. */ - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect_local(kg, - local_isect, - P, - dir, - ray->time, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + /* Intersect ray against primitive. */ + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) { + return true; + } + } + break; + } #endif - default: - break; - } - } - } while(node_addr != ENTRYPOINT_SENTINEL); - } while(node_addr != ENTRYPOINT_SENTINEL); + default: + break; + } + } + } while (node_addr != ENTRYPOINT_SENTINEL); + } while (node_addr != ENTRYPOINT_SENTINEL); - return false; + return false; } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 2e622af1758..7c1d8c8c72e 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -17,11 +17,11 @@ */ struct QBVHStackItem { - int addr; - float dist; + int addr; + float dist; }; -ccl_device_inline void qbvh_near_far_idx_calc(const float3& idir, +ccl_device_inline void qbvh_near_far_idx_calc(const float3 &idir, int *ccl_restrict near_x, int *ccl_restrict near_y, int *ccl_restrict near_z, @@ -31,44 +31,76 @@ ccl_device_inline void qbvh_near_far_idx_calc(const float3& idir, { #ifdef __KERNEL_SSE__ - *near_x = 0; *far_x = 1; - *near_y = 2; *far_y = 3; - *near_z = 4; *far_z = 5; - - const size_t mask = movemask(ssef(idir.m128)); - - const int mask_x = mask & 1; - const int mask_y = (mask & 2) >> 1; - const int mask_z = (mask & 4) >> 2; - - *near_x += mask_x; *far_x -= mask_x; - *near_y += mask_y; *far_y -= mask_y; - *near_z += mask_z; *far_z -= mask_z; + *near_x = 0; + *far_x = 1; + *near_y = 2; + *far_y = 3; + *near_z = 4; + *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; + *far_x -= mask_x; + *near_y += mask_y; + *far_y -= mask_y; + *near_z += mask_z; + *far_z -= mask_z; #else - if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } - if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } - if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } + if (idir.x >= 0.0f) { + *near_x = 0; + *far_x = 1; + } + else { + *near_x = 1; + *far_x = 0; + } + if (idir.y >= 0.0f) { + *near_y = 2; + *far_y = 3; + } + else { + *near_y = 3; + *far_y = 2; + } + if (idir.z >= 0.0f) { + *near_z = 4; + *far_z = 5; + } + else { + *near_z = 5; + *far_z = 4; + } #endif } /* TOOD(sergey): Investigate if using intrinsics helps for both * stack item swap and float comparison. */ -ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, - QBVHStackItem *ccl_restrict b) +ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b) { - QBVHStackItem tmp = *a; - *a = *b; - *b = tmp; + QBVHStackItem tmp = *a; + *a = *b; + *b = tmp; } ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, QBVHStackItem *ccl_restrict s2, QBVHStackItem *ccl_restrict s3) { - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if (s2->dist < s1->dist) { + qbvh_item_swap(s2, s1); + } + if (s3->dist < s2->dist) { + qbvh_item_swap(s3, s2); + } + if (s2->dist < s1->dist) { + qbvh_item_swap(s2, s1); + } } ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, @@ -76,279 +108,283 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, QBVHStackItem *ccl_restrict s3, QBVHStackItem *ccl_restrict s4) { - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } + if (s2->dist < s1->dist) { + qbvh_item_swap(s2, s1); + } + if (s4->dist < s3->dist) { + qbvh_item_swap(s4, s3); + } + if (s3->dist < s1->dist) { + qbvh_item_swap(s3, s1); + } + if (s4->dist < s2->dist) { + qbvh_item_swap(s4, s2); + } + if (s3->dist < s2->dist) { + qbvh_item_swap(s3, s2); + } } /* Axis-aligned nodes intersection */ //ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& org_idir, + const sse3f &org_idir, #else - const sse3f& org, + const sse3f &org, #endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) { - const int offset = node_addr + 1; + const int offset = node_addr + 1; #ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); + const ssef tnear_x = msub( + kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, org_idir.x); + const ssef tnear_y = msub( + kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, org_idir.y); + const ssef tnear_z = msub( + kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, org_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, org_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, org_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, org_idir.z); #else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - org.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - org.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - org.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - org.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - org.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - org.z) * idir.z; #endif #ifdef __KERNEL_SSE41__ - const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); - const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); - const sseb vmask = cast(tnear) > cast(tfar); - int mask = (int)movemask(vmask)^0xf; + const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); + const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); + const sseb vmask = cast(tnear) > cast(tfar); + int mask = (int)movemask(vmask) ^ 0xf; #else - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - int mask = (int)movemask(vmask); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); #endif - *dist = tnear; - return mask; + *dist = tnear; + return mask; } -ccl_device_inline int qbvh_aligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, +ccl_device_inline int qbvh_aligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& P_idir, + const sse3f &P_idir, #else - const sse3f& P, + const sse3f &P, #endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - ssef *ccl_restrict dist) + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) { - const int offset = node_addr + 1; + const int offset = node_addr + 1; #ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, P_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, P_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, P_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, P_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, P_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, P_idir.z); #else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - P.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - P.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - P.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - P.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - P.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - P.z) * idir.z; #endif - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = round_down*tnear <= round_up*tfar; - *dist = tnear; - return (int)movemask(vmask); + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down * tnear <= round_up * tfar; + *dist = tnear; + return (int)movemask(vmask); } /* Unaligned nodes intersection */ -ccl_device_inline int qbvh_unaligned_node_intersect( - KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, +ccl_device_inline int qbvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& org_idir, + const sse3f &org_idir, #endif - const sse3f& org, - const sse3f& dir, - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) + const sse3f &org, + const sse3f &dir, + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) { - const int offset = node_addr; - const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); - const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); - const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3); - const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); - const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); - const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6); - const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); - const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); - const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9); - const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); - const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); - const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12); - const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; - const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, - aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, - aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + const ssef aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x, + aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y, + aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z; - const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); - const ssef nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; - const ssef tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; + const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; - const ssef tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; + const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; #ifdef __KERNEL_SSE41__ - const ssef tnear_x = mini(tlower_x, tupper_x); - const ssef tnear_y = mini(tlower_y, tupper_y); - const ssef tnear_z = mini(tlower_z, tupper_z); - const ssef tfar_x = maxi(tlower_x, tupper_x); - const ssef tfar_y = maxi(tlower_y, tupper_y); - const ssef tfar_z = maxi(tlower_z, tupper_z); - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); #else - const ssef tnear_x = min(tlower_x, tupper_x); - const ssef tnear_y = min(tlower_y, tupper_y); - const ssef tnear_z = min(tlower_z, tupper_z); - const ssef tfar_x = max(tlower_x, tupper_x); - const ssef tfar_y = max(tlower_y, tupper_y); - const ssef tfar_z = max(tlower_z, tupper_z); - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); #endif } -ccl_device_inline int qbvh_unaligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, +ccl_device_inline int qbvh_unaligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& P_idir, + const sse3f &P_idir, #endif - const sse3f& P, - const sse3f& dir, - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - ssef *ccl_restrict dist) + const sse3f &P, + const sse3f &dir, + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) { - const int offset = node_addr; - const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); - const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); - const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3); - const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); - const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); - const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6); - const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); - const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); - const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9); - const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); - const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); - const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12); - const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; - const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, - aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, - aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + const ssef aligned_P_x = P.x * tfm_x_x + P.y * tfm_x_y + P.z * tfm_x_z + tfm_t_x, + aligned_P_y = P.x * tfm_y_x + P.y * tfm_y_y + P.z * tfm_y_z + tfm_t_y, + aligned_P_z = P.x * tfm_z_x + P.y * tfm_z_y + P.z * tfm_z_z + tfm_t_z; - const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); - const ssef nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; - const ssef tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; + const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; - const ssef tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; + const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; #ifdef __KERNEL_SSE41__ - const ssef tnear_x = mini(tlower_x, tupper_x); - const ssef tnear_y = mini(tlower_y, tupper_y); - const ssef tnear_z = mini(tlower_z, tupper_z); - const ssef tfar_x = maxi(tlower_x, tupper_x); - const ssef tfar_y = maxi(tlower_y, tupper_y); - const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); #else - const ssef tnear_x = min(tlower_x, tupper_x); - const ssef tnear_y = min(tlower_y, tupper_y); - const ssef tnear_z = min(tlower_z, tupper_z); - const ssef tfar_x = max(tlower_x, tupper_x); - const ssef tfar_y = max(tlower_y, tupper_y); - const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); #endif - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = round_down*tnear <= round_up*tfar; - *dist = tnear; - return movemask(vmask); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down * tnear <= round_up * tfar; + *dist = tnear; + return movemask(vmask); } /* Intersectors wrappers. @@ -356,111 +392,125 @@ ccl_device_inline int qbvh_unaligned_node_intersect_robust( * They'll check node type and call appropriate intersection code. */ -ccl_device_inline int qbvh_node_intersect( - KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, +ccl_device_inline int qbvh_node_intersect(KernelGlobals *ccl_restrict kg, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& org_idir, + const sse3f &org_idir, #endif - const sse3f& org, - const sse3f& dir, - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) + const sse3f &org, + const sse3f &dir, + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return qbvh_unaligned_node_intersect(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #endif - org, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } - else { - return qbvh_aligned_node_intersect(kg, - isect_near, - isect_far, + org, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } + else { + return qbvh_aligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #else - org, + org, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } } -ccl_device_inline int qbvh_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const ssef& isect_near, - const ssef& isect_far, +ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const ssef &isect_near, + const ssef &isect_far, #ifdef __KERNEL_AVX2__ - const sse3f& P_idir, + const sse3f &P_idir, #endif - const sse3f& P, - const sse3f& dir, - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - ssef *ccl_restrict dist) + const sse3f &P, + const sse3f &dir, + const sse3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return qbvh_unaligned_node_intersect_robust(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #endif - P, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } - else { - return qbvh_aligned_node_intersect_robust(kg, - isect_near, - isect_far, + P, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } + else { + return qbvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #else - P, + P, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } } diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index dd977fb9e74..49e607bfbd0 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -36,439 +36,424 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const uint max_hits, uint *num_hits) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - *num_hits = 0; - isect_array->t = tmax; - + *num_hits = 0; + isect_array->t = tmax; #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); #endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void) inodes; - - if(false + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + (void)inodes; + + if (false #ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & visibility) == 0) + || ((__float_as_uint(inodes.x) & visibility) == 0) #endif #if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) - || UNLIKELY(ray->time > inodes.z) + || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) #endif - ) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ssef dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - float4 cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + float4 cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); + } + else #endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - int prim_addr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - while(prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - bool hit; - - /* todo: specialized intersect functions which don't fill in - * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? - * might give a few % performance improvement */ - - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr); - break; - } + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + while (prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + bool hit; + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, - isect_array, - P, - dir, - ray->time, - visibility, - object, - prim_addr); - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + break; + } #endif #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); - } - break; - } + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, + 0); + } + break; + } #endif - default: { - hit = false; - break; - } - } + default: { + hit = false; + break; + } + } - /* Shadow ray early termination. */ - if(hit) { - /* detect if this surface has a shader with transparent shadows */ + /* Shadow ray early termination. */ + if (hit) { + /* detect if this surface has a shader with transparent shadows */ - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? */ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; #ifdef __HAIR__ - if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } + { + shader = kernel_tex_fetch(__tri_shader, prim); + } #ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } #endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if(*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if (*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - } + isect_array->t = isect_t; + } - prim_addr++; - } - } + prim_addr++; + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + num_hits_in_instance = 0; + isect_array->t = isect_t; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect_t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + node_addr = kernel_tex_fetch(__object_node, object); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* Scale isect->t to adjust for instancing. */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* Scale isect->t to adjust for instancing. */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect_t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return false; + return false; } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index 40cd57aad34..9ee0f7b5933 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -37,457 +37,446 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, Intersection *isect, const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - ,uint *lcg_state, + , + uint *lcg_state, float difl, float extmax #endif - ) +) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - traversal_stack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - float node_dist = -FLT_MAX; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; - BVH_DEBUG_INIT(); + BVH_DEBUG_INIT(); - ssef tnear(0.0f), tfar(ray->t); + ssef tnear(0.0f), tfar(ray->t); #if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); #endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void) inodes; - - if(UNLIKELY(node_dist > isect->t) + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + (void)inodes; + + if (UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) - || UNLIKELY(ray->time > inodes.z) + || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) #endif #ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0 + || (__float_as_uint(inodes.x) & visibility) == 0 #endif - ) - { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } - int child_mask; - ssef dist; + int child_mask; + ssef dist; - BVH_DEBUG_NEXT_NODE(); + BVH_DEBUG_NEXT_NODE(); #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - /* NOTE: We extend all the child BB instead of fetching - * and checking visibility flags for each of the, - * - * Need to test if doing opposite would be any faster. - */ - child_mask = NODE_INTERSECT_ROBUST(kg, - tnear, - tfar, + if (difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, # ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, # endif # if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, # endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - &dist); - } - else -#endif /* BVH_HAIR_MINIMUM_WIDTH */ - { - child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - } - - if(child_mask != 0) { - float4 cnodes; - /* TODO(sergey): Investigate whether moving cnodes upwards - * gives a speedup (will be different cache pattern but will - * avoid extra check here). - */ + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + } + + if (child_mask != 0) { + float4 cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here). + */ #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); + } + else #endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - float d0 = ((float*)&dist)[r]; - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - node_dist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - node_dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - node_dist = d0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float *)&dist)[r]; + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); #ifdef __VISIBILITY_FLAG__ - if(UNLIKELY((node_dist > isect->t) || - ((__float_as_uint(leaf.z) & visibility) == 0))) + if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) #else - if(UNLIKELY((node_dist > isect->t))) + if (UNLIKELY((node_dist > isect->t))) #endif - { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } - int prim_addr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(triangle_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch (type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if(motion_triangle_intersect(kg, - isect, - P, - dir, - ray->time, - visibility, - object, - prim_addr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if (motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - else { - hit = curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); - } - if(hit) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for (; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if (hit) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); # if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); + qbvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); # else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); # endif - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect->t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - traversal_stack[stack_ptr].dist = -FLT_MAX; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; - node_addr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ + /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect->t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index 6790bfa6c83..e4eaed04467 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -33,331 +33,335 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, Intersection *isect, const uint visibility) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; - ssef tnear(0.0f), tfar(ray->t); + ssef tnear(0.0f), tfar(ray->t); #if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); #endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + ssef dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - float4 cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + float4 cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); + } + else #endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - - if((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + + if ((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); - } - break; - } + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); - } - break; - } + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + motion_triangle_intersect( + kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } #endif - } - } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect->t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ + /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect->t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect->t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return (isect->prim != PRIM_NONE); + return (isect->prim != PRIM_NONE); } #undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 63d79b6fe34..eddc48c487e 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -34,405 +34,411 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const uint max_hits, const uint visibility) { - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; #if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; + Transform ob_itfm; #endif - uint num_hits = 0; - isect_array->t = tmax; + uint num_hits = 0; + isect_array->t = tmax; #if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; + int num_hits_in_instance = 0; #endif - ssef tnear(0.0f), tfar(isect_t); + ssef tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); #endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); + float3 P_idir = P * idir; + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } + if ((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } #endif - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, + ssef dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, + P_idir4, #endif #if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, + org4, #endif #if BVH_FEATURE(BVH_HAIR) - dir4, + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - &dist); - - if(child_mask != 0) { - float4 cnodes; + idir4, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + &dist); + + if (child_mask != 0) { + float4 cnodes; #if BVH_FEATURE(BVH_HAIR) - if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); - } - else + if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); + } + else #endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if(child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float*)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(child_mask == 0) { - if(d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if(node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); - - if((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if (child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float *)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float *)&dist)[r]; + if (child_mask == 0) { + if (d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float *)&dist)[r]; + if (child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float *)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if (node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); + + if ((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(prim_addr >= 0) { + if (prim_addr >= 0) { #endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - bool hit; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch(p_type) { - case PRIMITIVE_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch (p_type) { + case PRIMITIVE_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; #if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; #endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) - if(object != OBJECT_NONE) { + if (object != OBJECT_NONE) { # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform( + kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } #if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); - if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; + case PRIMITIVE_MOTION_TRIANGLE: { + for (; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, prim_addr) : + object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect( + kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if (hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; # if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; + num_hits_in_instance++; # endif - isect_array->t = isect_t; - if(num_hits == max_hits) { + isect_array->t = isect_t; + if (num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) - if(object != OBJECT_NONE) { + if (object != OBJECT_NONE) { # if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); # else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); + Transform itfm = object_fetch_transform( + kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); # endif - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } #endif - } - } + } + } #if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr-1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if(object_flag & SD_OBJECT_HAS_VOLUME) { + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr - 1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect_t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - num_hits_in_instance = 0; - isect_array->t = isect_t; - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); + if (stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); - /* Instance pop. */ - if(num_hits_in_instance) { - float t_fac; + /* Instance pop. */ + if (num_hits_in_instance) { + float t_fac; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); # else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - /* Scale isect->t to adjust for instancing. */ - for(int i = 0; i < num_hits_in_instance; i++) { - (isect_array-i-1)->t *= t_fac; - } - } - else { + /* Scale isect->t to adjust for instancing. */ + for (int i = 0; i < num_hits_in_instance; i++) { + (isect_array - i - 1)->t *= t_fac; + } + } + else { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - } + } - isect_t = tmax; - isect_array->t = isect_t; + isect_t = tmax; + isect_array->t = isect_t; - qbvh_near_far_idx_calc(idir, - &near_x, &near_y, &near_z, - &far_x, &far_y, &far_z); - tfar = ssef(isect_t); + qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); + tfar = ssef(isect_t); # if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); # endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); + P_idir = P * idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); # endif # if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(node_addr != ENTRYPOINT_SENTINEL); + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while (node_addr != ENTRYPOINT_SENTINEL); - return num_hits; + return num_hits; } #undef NODE_INTERSECT |