git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/bvh')
-rw-r--r--  intern/cycles/kernel/bvh/bvh.h              424
-rw-r--r--  intern/cycles/kernel/bvh/bvh_nodes.h        656
-rw-r--r--  intern/cycles/kernel/bvh/bvh_shadow_all.h   420
-rw-r--r--  intern/cycles/kernel/bvh/bvh_subsurface.h   265
-rw-r--r--  intern/cycles/kernel/bvh/bvh_traversal.h    466
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume.h       336
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume_all.h   411
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_nodes.h       433
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_shadow_all.h  485
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_subsurface.h  298
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_traversal.h   505
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_volume.h      374
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_volume_all.h  448
13 files changed, 5521 insertions(+), 0 deletions(-)
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
new file mode 100644
index 00000000000..59881738195
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -0,0 +1,424 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* BVH
+ *
+ * Bounding volume hierarchy for ray tracing. We compile different variations
+ * of the same BVH traversal function for faster rendering when some types of
+ * primitives are not needed, using #includes to work around the lack of
+ * C++ templates in OpenCL.
+ *
+ * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
+ * the code has been extended and modified to support more primitives and work
+ * with CPU/CUDA/OpenCL. */
+
+CCL_NAMESPACE_BEGIN
+
+/* Don't inline intersect functions on the GPU, this is faster */
+#ifdef __KERNEL_GPU__
+# define ccl_device_intersect ccl_device_noinline
+#else
+# define ccl_device_intersect ccl_device_inline
+#endif
+
+/* bottom-most stack entry, indicating the end of traversal */
+#define ENTRYPOINT_SENTINEL 0x76543210
+
+/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
+#define BVH_STACK_SIZE 192
+#define BVH_QSTACK_SIZE 384
+
+/* BVH intersection function variations */
+
+#define BVH_INSTANCING 1
+#define BVH_MOTION 2
+#define BVH_HAIR 4
+#define BVH_HAIR_MINIMUM_WIDTH 8
+
+#define BVH_NAME_JOIN(x,y) x ## _ ## y
+#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y)
+#define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME)
+
+#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
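
These three macros are the standard two-level token-pasting idiom: the extra
BVH_NAME_EVAL layer forces BVH_FUNCTION_NAME to be expanded before ## pastes
it onto the prefix, and BVH_FEATURE folds to a compile-time constant so
disabled feature branches are eliminated entirely. A minimal standalone
sketch of the same pattern (all names here are made up for illustration, not
Cycles code):

    #include <cstdio>

    #define NAME_JOIN(x, y) x ## _ ## y
    #define NAME_EVAL(x, y) NAME_JOIN(x, y)          /* expands args first */
    #define FULL_NAME(prefix) NAME_EVAL(prefix, FUNCTION_NAME)

    #define FUNCTION_NAME intersect_hair
    /* FULL_NAME(bvh) pastes to: bvh_intersect_hair */
    int FULL_NAME(bvh)(void) { return 42; }
    #undef FUNCTION_NAME

    int main()
    {
        printf("%d\n", bvh_intersect_hair());
        return 0;
    }
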
+
+/* Debugging helpers */
+#ifdef __KERNEL_DEBUG__
+# define BVH_DEBUG_INIT() \
+ do { \
+ isect->num_traversal_steps = 0; \
+ isect->num_traversed_instances = 0; \
+ } while(0)
+# define BVH_DEBUG_NEXT_STEP() \
+ do { \
+ ++isect->num_traversal_steps; \
+ } while(0)
+# define BVH_DEBUG_NEXT_INSTANCE() \
+ do { \
+ ++isect->num_traversed_instances; \
+ } while(0)
+#else /* __KERNEL_DEBUG__ */
+# define BVH_DEBUG_INIT()
+# define BVH_DEBUG_NEXT_STEP()
+# define BVH_DEBUG_NEXT_INSTANCE()
+#endif /* __KERNEL_DEBUG__ */
+
+/* Common QBVH functions. */
+#ifdef __QBVH__
+# include "qbvh_nodes.h"
+#endif
+
+/* Regular BVH traversal */
+
+#include "bvh_nodes.h"
+
+#define BVH_FUNCTION_NAME bvh_intersect
+#define BVH_FUNCTION_FEATURES 0
+#include "bvh_traversal.h"
+
+#if defined(__INSTANCING__)
+# define BVH_FUNCTION_NAME bvh_intersect_instancing
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# include "bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__)
+# define BVH_FUNCTION_NAME bvh_intersect_hair
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+# include "bvh_traversal.h"
+#endif
+
+#if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+# include "bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_hair_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+# include "bvh_traversal.h"
+#endif
+
+/* Subsurface scattering BVH traversal */
+
+#if defined(__SUBSURFACE__)
+# define BVH_FUNCTION_NAME bvh_intersect_subsurface
+# define BVH_FUNCTION_FEATURES BVH_HAIR
+# include "bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
+# include "bvh_subsurface.h"
+#endif
+
+/* Volume BVH traversal */
+
+#if defined(__VOLUME__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume
+# define BVH_FUNCTION_FEATURES BVH_HAIR
+# include "bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+# include "bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+# include "bvh_volume.h"
+#endif
+
+/* Record all intersections - Shadow BVH traversal */
+
+#if defined(__SHADOW_RECORD_ALL__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+# define BVH_FUNCTION_FEATURES 0
+# include "bvh_shadow_all.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# include "bvh_shadow_all.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+# include "bvh_shadow_all.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+# include "bvh_shadow_all.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+# include "bvh_shadow_all.h"
+#endif
+
+/* Record all intersections - Volume BVH traversal */
+
+#if defined(__VOLUME_RECORD_ALL__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_all
+# define BVH_FUNCTION_FEATURES BVH_HAIR
+# include "bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+# include "bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+# include "bvh_volume_all.h"
+#endif
+
+#undef BVH_FEATURE
+#undef BVH_NAME_JOIN
+#undef BVH_NAME_EVAL
+#undef BVH_FUNCTION_FULL_NAME
+
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+ const Ray *ray,
+ const uint visibility,
+ Intersection *isect,
+ uint *lcg_state,
+ float difl,
+ float extmax)
+{
+#ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+# ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+# endif /* __HAIR__ */
+
+ return bvh_intersect_motion(kg, ray, isect, visibility);
+ }
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+# ifdef __INSTANCING__
+ if(kernel_data.bvh.have_instancing)
+ return bvh_intersect_instancing(kg, ray, isect, visibility);
+# endif /* __INSTANCING__ */
+
+ return bvh_intersect(kg, ray, isect, visibility);
+#else /* __KERNEL_CPU__ */
+
+# ifdef __INSTANCING__
+ return bvh_intersect_instancing(kg, ray, isect, visibility);
+# else
+ return bvh_intersect(kg, ray, isect, visibility);
+# endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
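
A hedged usage sketch of the dispatcher above, assuming the usual kernel
context (kg, make_float3, normalize, PATH_RAY_CAMERA); the values are
illustrative only:

    /* Hypothetical call site, not kernel code. */
    Ray ray;
    ray.P = make_float3(0.0f, 0.0f, 0.0f);
    ray.D = normalize(make_float3(0.0f, 0.0f, 1.0f));
    ray.t = FLT_MAX;
    ray.time = 0.0f;

    Intersection isect;
    /* NULL lcg_state and zero difl/extmax: no curve minimum-width widening. */
    if(scene_intersect(kg, &ray, PATH_RAY_CAMERA, &isect, NULL, 0.0f, 0.0f)) {
        /* isect.prim/isect.object identify the hit, isect.t its distance. */
    }
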
+
+#ifdef __SUBSURFACE__
+ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
+ const Ray *ray,
+ SubsurfaceIntersection *ss_isect,
+ int subsurface_object,
+ uint *lcg_state,
+ int max_hits)
+{
+#ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+ return bvh_intersect_subsurface_motion(kg,
+ ray,
+ ss_isect,
+ subsurface_object,
+ lcg_state,
+ max_hits);
+ }
+#endif /* __OBJECT_MOTION__ */
+ return bvh_intersect_subsurface(kg,
+ ray,
+ ss_isect,
+ subsurface_object,
+ lcg_state,
+ max_hits);
+}
+#endif
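
A similar sketch for the subsurface entry point, which records up to max_hits
candidate hits on a single object rather than returning the closest one (the
seed is a placeholder for proper LCG initialization):

    /* Hypothetical call site, not kernel code. */
    SubsurfaceIntersection ss_isect;
    uint lcg_state = 0x12345678;
    scene_intersect_subsurface(kg, &ray, &ss_isect, object, &lcg_state, 4);
    for(int i = 0; i < ss_isect.num_hits; i++) {
        /* each recorded hit is one candidate sample point on the surface */
    }
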
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+{
+# ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+# ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+# endif /* __HAIR__ */
+
+ return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+ }
+# endif /* __OBJECT_MOTION__ */
+
+# ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+# endif /* __HAIR__ */
+
+# ifdef __INSTANCING__
+ if(kernel_data.bvh.have_instancing)
+ return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+# endif /* __INSTANCING__ */
+
+ return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif /* __SHADOW_RECORD_ALL__ */
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+# ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+ return bvh_intersect_volume_motion(kg, ray, isect, visibility);
+ }
+# endif /* __OBJECT_MOTION__ */
+# ifdef __KERNEL_CPU__
+# ifdef __INSTANCING__
+ if(kernel_data.bvh.have_instancing)
+ return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
+# endif /* __INSTANCING__ */
+ return bvh_intersect_volume(kg, ray, isect, visibility);
+# else /* __KERNEL_CPU__ */
+# ifdef __INSTANCING__
+ return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
+# else
+ return bvh_intersect_volume(kg, ray, isect, visibility);
+# endif /* __INSTANCING__ */
+# endif /* __KERNEL_CPU__ */
+}
+#endif /* __VOLUME__ */
+
+#ifdef __VOLUME_RECORD_ALL__
+ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint max_hits,
+ const uint visibility)
+{
+# ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+ return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
+ }
+# endif /* __OBJECT_MOTION__ */
+# ifdef __INSTANCING__
+ if(kernel_data.bvh.have_instancing)
+ return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
+# endif /* __INSTANCING__ */
+ return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
+}
+#endif /* __VOLUME_RECORD_ALL__ */
+
+/* Ray offset to avoid self intersection.
+ *
+ * This function should be used to compute a modified ray start position for
+ * rays leaving from a surface. */
+
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
+{
+#ifdef __INTERSECTION_REFINE__
+ const float epsilon_f = 1e-5f;
+ /* ideally this should match epsilon_f, but instancing and motion blur
+ * precision makes it problematic */
+ const float epsilon_test = 1.0f;
+ const int epsilon_i = 32;
+
+ float3 res;
+
+ /* x component */
+ if(fabsf(P.x) < epsilon_test) {
+ res.x = P.x + Ng.x*epsilon_f;
+ }
+ else {
+ uint ix = __float_as_uint(P.x);
+ ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
+ res.x = __uint_as_float(ix);
+ }
+
+ /* y component */
+ if(fabsf(P.y) < epsilon_test) {
+ res.y = P.y + Ng.y*epsilon_f;
+ }
+ else {
+ uint iy = __float_as_uint(P.y);
+ iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
+ res.y = __uint_as_float(iy);
+ }
+
+ /* z component */
+ if(fabsf(P.z) < epsilon_test) {
+ res.z = P.z + Ng.z*epsilon_f;
+ }
+ else {
+ uint iz = __float_as_uint(P.z);
+ iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
+ res.z = __uint_as_float(iz);
+ }
+
+ return res;
+#else
+ const float epsilon_f = 1e-4f;
+ return P + epsilon_f*Ng;
+#endif
+}
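
The non-refine branch offsets by a fixed epsilon, while the integer branch
steps the float by epsilon_i ulps, so the offset scales with the magnitude of
P. A standalone scalar sketch of that integer branch (illustration only):

    #include <cstdio>
    #include <cstdint>
    #include <cstring>

    /* Step x by n ulps in the direction of d; the sign-bit xor mirrors
     * (ix ^ __float_as_uint(Ng.x)) >> 31 in ray_offset() above. */
    static float nudge_ulps(float x, float d, uint32_t n)
    {
        uint32_t ix, id;
        memcpy(&ix, &x, sizeof(ix));
        memcpy(&id, &d, sizeof(id));
        /* When the signs differ, decreasing the bit pattern moves x toward d. */
        ix += ((ix ^ id) >> 31) ? -n : n;
        float r;
        memcpy(&r, &ix, sizeof(r));
        return r;
    }

    int main()
    {
        float x = 1.0f;
        printf("%.9g -> %.9g\n", x, nudge_ulps(x, 1.0f, 32));
        return 0;
    }
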
+
+#if defined(__SHADOW_RECORD_ALL__) || defined(__VOLUME_RECORD_ALL__)
+/* TODO: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+ const Intersection *isect_a = (const Intersection*)a;
+ const Intersection *isect_b = (const Intersection*)b;
+
+ if(isect_a->t < isect_b->t)
+ return -1;
+ else if(isect_a->t > isect_b->t)
+ return 1;
+ else
+ return 0;
+}
+#endif
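
The comparator is deliberately qsort-compatible, since the record-all
traversals fill the hit array in traversal order, not distance order. A
hypothetical sketch of sorting the results nearest-first:

    /* Hypothetical: names and sizes are illustrative. */
    Intersection hits[64];
    uint num_hits = 0;
    bool blocked = scene_intersect_shadow_all(kg, &ray, hits, 64, &num_hits);
    if(!blocked) {
        qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
        /* walk hits[0..num_hits) front to back, e.g. for transparent shadows */
    }
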
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
new file mode 100644
index 00000000000..db2275b0ff8
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright 2011-2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
+// 3-vector which might be faster.
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ int node_addr,
+ int child)
+{
+ Transform space;
+ const int child_addr = node_addr + child * 3;
+ space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
+ space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
+ space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
+ space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+ return space;
+}
+
+#if !defined(__KERNEL_SSE2__)
+ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ const float3 P,
+ const float3 idir,
+ const float t,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+
+ /* fetch node data */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
+
+ /* intersect ray against child nodes */
+ float c0lox = (node0.x - P.x) * idir.x;
+ float c0hix = (node0.z - P.x) * idir.x;
+ float c0loy = (node1.x - P.y) * idir.y;
+ float c0hiy = (node1.z - P.y) * idir.y;
+ float c0loz = (node2.x - P.z) * idir.z;
+ float c0hiz = (node2.z - P.z) * idir.z;
+ float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+ float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+ float c1lox = (node0.y - P.x) * idir.x;
+ float c1hix = (node0.w - P.x) * idir.x;
+ float c1loy = (node1.y - P.y) * idir.y;
+ float c1hiy = (node1.w - P.y) * idir.y;
+ float c1loz = (node2.y - P.z) * idir.z;
+ float c1hiz = (node2.w - P.z) * idir.z;
+ float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+ dist[0] = c0min;
+ dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+ return ((c0max >= c0min)? 1: 0) |
+ ((c1max >= c1min)? 2: 0);
+#endif
+}
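
For reference, the four fetches above imply the following packed two-child
node layout; this struct is a reconstruction for illustration, not a type the
kernel defines:

    /* Illustrative only: layout implied by the kernel_tex_fetch() offsets. */
    struct BVHAlignedNode {
        float4 cnodes;   /* x,y: per-child visibility masks (uint bits),
                          * z,w: child node addresses (int bits) */
        float4 bounds_x; /* x: c0.lo.x, y: c1.lo.x, z: c0.hi.x, w: c1.hi.x */
        float4 bounds_y; /* same interleaving for the y axis */
        float4 bounds_z; /* same interleaving for the z axis */
    };

Keeping the two children's bounds interleaved per axis is what lets the SSE
variant below test both children with one shuffle per axis.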
+
+ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P,
+ const float3 idir,
+ const float t,
+ const float difl,
+ const float extmax,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+
+ /* fetch node data */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
+
+ /* intersect ray against child nodes */
+ float c0lox = (node0.x - P.x) * idir.x;
+ float c0hix = (node0.z - P.x) * idir.x;
+ float c0loy = (node1.x - P.y) * idir.y;
+ float c0hiy = (node1.z - P.y) * idir.y;
+ float c0loz = (node2.x - P.z) * idir.z;
+ float c0hiz = (node2.z - P.z) * idir.z;
+ float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+ float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+ float c1lox = (node0.y - P.x) * idir.x;
+ float c1hix = (node0.w - P.x) * idir.x;
+ float c1loy = (node1.y - P.y) * idir.y;
+ float c1hiy = (node1.w - P.y) * idir.y;
+ float c1loz = (node2.y - P.z) * idir.z;
+ float c1hiz = (node2.w - P.z) * idir.z;
+ float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
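+	/* Hair minimum-width support: for children flagged as containing curves,
+	 * widen [tmin,tmax] relatively by difl, capped at extmax of absolute
+	 * widening, so thin curves are not culled by the exact slab test. */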
+ if(difl != 0.0f) {
+ float hdiff = 1.0f + difl;
+ float ldiff = 1.0f - difl;
+ if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+ c0min = max(ldiff * c0min, c0min - extmax);
+ c0max = min(hdiff * c0max, c0max + extmax);
+ }
+ if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+ c1min = max(ldiff * c1min, c1min - extmax);
+ c1max = min(hdiff * c1max, c1max + extmax);
+ }
+ }
+
+ dist[0] = c0min;
+ dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+ return ((c0max >= c0min)? 1: 0) |
+ ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float t,
+ int node_addr,
+ int child,
+ float dist[2])
+{
+ Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
+ float3 aligned_dir = transform_direction(&space, dir);
+ float3 aligned_P = transform_point(&space, P);
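+	/* In node-local space the child box is the unit cube, so with
+	 * nrdir = -1/dir the slab distances are t_lo = P*nrdir and
+	 * t_hi = (1 - P)/dir = t_lo - nrdir, per axis. */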
+ float3 nrdir = -bvh_inverse_direction(aligned_dir);
+ float3 lower_xyz = aligned_P * nrdir;
+ float3 upper_xyz = lower_xyz - nrdir;
+ const float near_x = min(lower_xyz.x, upper_xyz.x);
+ const float near_y = min(lower_xyz.y, upper_xyz.y);
+ const float near_z = min(lower_xyz.z, upper_xyz.z);
+ const float far_x = max(lower_xyz.x, upper_xyz.x);
+ const float far_y = max(lower_xyz.y, upper_xyz.y);
+ const float far_z = max(lower_xyz.z, upper_xyz.z);
+ const float tnear = max4(0.0f, near_x, near_y, near_z);
+ const float tfar = min4(t, far_x, far_y, far_z);
+ *dist = tnear;
+ return tnear <= tfar;
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float t,
+ const float difl,
+ int node_addr,
+ int child,
+ float dist[2])
+{
+ Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
+ float3 aligned_dir = transform_direction(&space, dir);
+ float3 aligned_P = transform_point(&space, P);
+ float3 nrdir = -bvh_inverse_direction(aligned_dir);
+ float3 tLowerXYZ = aligned_P * nrdir;
+ float3 tUpperXYZ = tLowerXYZ - nrdir;
+ const float near_x = min(tLowerXYZ.x, tUpperXYZ.x);
+ const float near_y = min(tLowerXYZ.y, tUpperXYZ.y);
+ const float near_z = min(tLowerXYZ.z, tUpperXYZ.z);
+ const float far_x = max(tLowerXYZ.x, tUpperXYZ.x);
+ const float far_y = max(tLowerXYZ.y, tUpperXYZ.y);
+ const float far_z = max(tLowerXYZ.z, tUpperXYZ.z);
+ const float tnear = max4(0.0f, near_x, near_y, near_z);
+ const float tfar = min4(t, far_x, far_y, far_z);
+ *dist = tnear;
+ if(difl != 0.0f) {
+ /* TODO(sergey): Same as for QBVH, needs a proper use. */
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ return round_down*tnear <= round_up*tfar;
+ }
+ else {
+ return tnear <= tfar;
+ }
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float3 idir,
+ const float t,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ int mask = 0;
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.x) & visibility))
+#endif
+ {
+ mask |= 1;
+ }
+ }
+ if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.y) & visibility))
+#endif
+ {
+ mask |= 2;
+ }
+ }
+ return mask;
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float3 idir,
+ const float t,
+ const float difl,
+ const float extmax,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ int mask = 0;
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.x) & visibility))
+#endif
+ {
+ mask |= 1;
+ }
+ }
+ if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.y) & visibility))
+#endif
+ {
+ mask |= 2;
+ }
+ }
+ return mask;
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float3 idir,
+ const float t,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return bvh_unaligned_node_intersect(kg,
+ P,
+ dir,
+ idir,
+ t,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect(kg,
+ P,
+ idir,
+ t,
+ node_addr,
+ visibility,
+ dist);
+ }
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const float3 idir,
+ const float t,
+ const float difl,
+ const float extmax,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return bvh_unaligned_node_intersect_robust(kg,
+ P,
+ dir,
+ idir,
+ t,
+ difl,
+ extmax,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect_robust(kg,
+ P,
+ idir,
+ t,
+ difl,
+ extmax,
+ node_addr,
+ visibility,
+ dist);
+ }
+}
+#else /* !defined(__KERNEL_SSE2__) */
+
+int ccl_device_inline bvh_aligned_node_intersect(
+ KernelGlobals *kg,
+ const float3& P,
+ const float3& dir,
+ const ssef& tsplat,
+ const ssef Psplat[3],
+ const ssef idirsplat[3],
+ const shuffle_swap_t shufflexyz[3],
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+ /* fetch node data */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr;
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+ /* calculate { c0min, c1min, -c0max, -c1max} */
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
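+	/* After the sign flip, tminmax = {c0min, c1min, c0max, c1max}; comparing
+	 * it against its {2,3,0,1} shuffle tests c0min <= c0max and c1min <= c1max
+	 * in lanes 0 and 1. */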
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+ dist[0] = tminmax[0];
+ dist[1] = tminmax[1];
+
+ int mask = movemask(lrhit);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_aligned_node_intersect_robust(
+ KernelGlobals *kg,
+ const float3& P,
+ const float3& dir,
+ const ssef& tsplat,
+ const ssef Psplat[3],
+ const ssef idirsplat[3],
+ const shuffle_swap_t shufflexyz[3],
+ const float difl,
+ const float extmax,
+ const int nodeAddr,
+ const uint visibility,
+ float dist[2])
+{
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+ /* fetch node data */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+ /* calculate { c0min, c1min, -c0max, -c1max} */
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	ssef tminmax = minmax ^ pn;
+
+ if(difl != 0.0f) {
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ float4 *tminmaxview = (float4*)&tminmax;
+ float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
+ float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
+ float hdiff = 1.0f + difl;
+ float ldiff = 1.0f - difl;
+ if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+ c0min = max(ldiff * c0min, c0min - extmax);
+ c0max = min(hdiff * c0max, c0max + extmax);
+ }
+ if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+ c1min = max(ldiff * c1min, c1min - extmax);
+ c1max = min(hdiff * c1max, c1max + extmax);
+ }
+ }
+
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+ dist[0] = tminmax[0];
+ dist[1] = tminmax[1];
+
+ int mask = movemask(lrhit);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const ssef& isect_near,
+ const ssef& isect_far,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
+ Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
+
+ float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);
+ float3 aligned_P0 = transform_point(&space0, P),
+ aligned_P1 = transform_point(&space1, P);
+ float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+ nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+ ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
+ aligned_P1.x * nrdir1.x,
+ 0.0f, 0.0f),
+ lower_y = ssef(aligned_P0.y * nrdir0.y,
+ aligned_P1.y * nrdir1.y,
+ 0.0f,
+ 0.0f),
+ lower_z = ssef(aligned_P0.z * nrdir0.z,
+ aligned_P1.z * nrdir1.z,
+ 0.0f,
+ 0.0f);
+
+ ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+ upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+ upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+ ssef tnear_x = min(lower_x, upper_x);
+ ssef tnear_y = min(lower_y, upper_y);
+ ssef tnear_z = min(lower_z, upper_z);
+ ssef tfar_x = max(lower_x, upper_x);
+ ssef tfar_y = max(lower_y, upper_y);
+ ssef tfar_z = max(lower_z, upper_z);
+
+ const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ sseb vmask = tnear <= tfar;
+ dist[0] = tnear.f[0];
+ dist[1] = tnear.f[1];
+
+ int mask = (int)movemask(vmask);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P,
+ const float3 dir,
+ const ssef& isect_near,
+ const ssef& isect_far,
+ const float difl,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
+ Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
+
+ float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);
+ float3 aligned_P0 = transform_point(&space0, P),
+ aligned_P1 = transform_point(&space1, P);
+ float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+ nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+ ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
+ aligned_P1.x * nrdir1.x,
+ 0.0f, 0.0f),
+ lower_y = ssef(aligned_P0.y * nrdir0.y,
+ aligned_P1.y * nrdir1.y,
+ 0.0f,
+ 0.0f),
+ lower_z = ssef(aligned_P0.z * nrdir0.z,
+ aligned_P1.z * nrdir1.z,
+ 0.0f,
+ 0.0f);
+
+ ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+ upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+ upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+ ssef tnear_x = min(lower_x, upper_x);
+ ssef tnear_y = min(lower_y, upper_y);
+ ssef tnear_z = min(lower_z, upper_z);
+ ssef tfar_x = max(lower_x, upper_x);
+ ssef tfar_y = max(lower_y, upper_y);
+ ssef tfar_z = max(lower_z, upper_z);
+
+ const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ sseb vmask;
+ if(difl != 0.0f) {
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ vmask = round_down*tnear <= round_up*tfar;
+ }
+ else {
+ vmask = tnear <= tfar;
+ }
+
+ dist[0] = tnear.f[0];
+ dist[1] = tnear.f[1];
+
+ int mask = (int)movemask(vmask);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ const float3& P,
+ const float3& dir,
+ const ssef& isect_near,
+ const ssef& isect_far,
+ const ssef& tsplat,
+ const ssef Psplat[3],
+ const ssef idirsplat[3],
+ const shuffle_swap_t shufflexyz[3],
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return bvh_unaligned_node_intersect(kg,
+ P,
+ dir,
+ isect_near,
+ isect_far,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect(kg,
+ P,
+ dir,
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ visibility,
+ dist);
+ }
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ const float3& P,
+ const float3& dir,
+ const ssef& isect_near,
+ const ssef& isect_far,
+ const ssef& tsplat,
+ const ssef Psplat[3],
+ const ssef idirsplat[3],
+ const shuffle_swap_t shufflexyz[3],
+ const float difl,
+ const float extmax,
+ const int node_addr,
+ const uint visibility,
+ float dist[2])
+{
+ float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return bvh_unaligned_node_intersect_robust(kg,
+ P,
+ dir,
+ isect_near,
+ isect_far,
+ difl,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect_robust(kg,
+ P,
+ dir,
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ difl,
+ extmax,
+ node_addr,
+ visibility,
+ dist);
+ }
+}
+#endif /* !defined(__KERNEL_SSE2__) */
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
new file mode 100644
index 00000000000..1869457f0c3
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -0,0 +1,420 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+# include "qbvh_shadow_all.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ uint *num_hits)
+{
+ /* todo:
+ * - likely and unlikely for if() statements
+ * - test restrict attribute for pointers
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversal_stack[BVH_STACK_SIZE];
+ traversal_stack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* ray parameters in registers */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ *num_hits = 0;
+ isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
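+	/* tsplat packs {0, 0, -t, -t}: together with the sign-flipped high lanes
+	 * of idirsplat, one max() in the node test clamps entry distances to >= 0
+	 * and, via negation, exit distances to <= t. */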
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif /* __KERNEL_SSE2__ */
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+				int node_addr_child1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#if !defined(__KERNEL_SSE2__)
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect_t,
+ node_addr,
+ PATH_RAY_SHADOW,
+ dist);
+#else // __KERNEL_SSE2__
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ PATH_RAY_SHADOW,
+ dist);
+#endif // __KERNEL_SSE2__
+
+ node_addr = __float_as_int(cnodes.z);
+				node_addr_child1 = __float_as_int(cnodes.w);
+
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool is_closest_child1 = (dist[1] < dist[0]);
+ if(is_closest_child1) {
+ int tmp = node_addr;
+						node_addr = node_addr_child1;
+						node_addr_child1 = tmp;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+					traversal_stack[stack_ptr] = node_addr_child1;
+ }
+ else {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
+						node_addr = node_addr_child1;
+ }
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ const int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* pop */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+
+ /* primitive intersection */
+ while(prim_addr < prim_addr2) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+
+ bool hit;
+
+ /* todo: specialized intersect functions which don't fill in
+ * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+ * might give a few % performance improvement */
+
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ hit = triangle_intersect(kg,
+ &isect_precalc,
+ isect_array,
+ P,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ hit = motion_triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ ray->time,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = bvh_cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ NULL,
+ 0, 0);
+ }
+ else {
+ hit = bvh_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ NULL,
+ 0, 0);
+ }
+ break;
+ }
+#endif
+ default: {
+ hit = false;
+ break;
+ }
+ }
+
+ /* shadow ray early termination */
+ if(hit) {
+ /* Update number of hits now, so we do proper check on max bounces. */
+ (*num_hits)++;
+
+ /* detect if this surface has a shader with transparent shadows */
+
+ /* todo: optimize so primitive visibility flag indicates if
+ * the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ /* Move on to next entry in intersections array */
+ isect_array++;
+ }
+
+ prim_addr++;
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* instance push */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ if(num_hits_in_instance) {
+ float t_fac;
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* scale isect->t to adjust for instancing */
+ for(int i = 0; i < num_hits_in_instance; i++)
+ (isect_array-i-1)->t *= t_fac;
+ }
+ else {
+ float ignore_t = FLT_MAX;
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return false;
+}
+
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ uint *num_hits)
+{
+#ifdef __QBVH__
+ if(kernel_data.bvh.use_qbvh) {
+ return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ num_hits);
+ }
+ else
+#endif
+ {
+ kernel_assert(kernel_data.bvh.use_qbvh == false);
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ num_hits);
+ }
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
new file mode 100644
index 00000000000..18978efcfa3
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -0,0 +1,265 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+# include "qbvh_subsurface.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ SubsurfaceIntersection *ss_isect,
+ int subsurface_object,
+ uint *lcg_state,
+ int max_hits)
+{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - test restrict attribute for pointers
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversal_stack[BVH_STACK_SIZE];
+ traversal_stack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stack_ptr = 0;
+ int node_addr = kernel_tex_fetch(__object_node, subsurface_object);
+
+ /* ray parameters in registers */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = ray->t;
+
+ ss_isect->num_hits = 0;
+
+ const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+ if(!(object_flag & SD_TRANSFORM_APPLIED)) {
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+ bvh_instance_motion_push(kg,
+ subsurface_object,
+ ray,
+ &P,
+ &dir,
+ &idir,
+ &isect_t,
+ &ob_itfm);
+#else
+ bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+#endif
+ object = subsurface_object;
+ }
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ int node_addr_child1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#if !defined(__KERNEL_SSE2__)
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect_t,
+ node_addr,
+ PATH_RAY_ALL_VISIBILITY,
+ dist);
+#else // __KERNEL_SSE2__
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ PATH_RAY_ALL_VISIBILITY,
+ dist);
+#endif // __KERNEL_SSE2__
+
+ node_addr = __float_as_int(cnodes.z);
+ node_addr_child1 = __float_as_int(cnodes.w);
+
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool is_closest_child1 = (dist[1] < dist[0]);
+ if(is_closest_child1) {
+ int tmp = node_addr;
+ node_addr = node_addr_child1;
+ node_addr_child1 = tmp;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = node_addr_child1;
+ }
+ else {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
+ node_addr = node_addr_child1;
+ }
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+ const int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* pop */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+
+ /* primitive intersection */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ triangle_intersect_subsurface(kg,
+ &isect_precalc,
+ ss_isect,
+ P,
+ object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ motion_triangle_intersect_subsurface(kg,
+ ss_isect,
+ P,
+ dir,
+ ray->time,
+ object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#endif
+ default: {
+ break;
+ }
+ }
+ }
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+}
+
+ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ SubsurfaceIntersection *ss_isect,
+ int subsurface_object,
+ uint *lcg_state,
+ int max_hits)
+{
+#ifdef __QBVH__
+ if(kernel_data.bvh.use_qbvh) {
+ return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+ ray,
+ ss_isect,
+ subsurface_object,
+ lcg_state,
+ max_hits);
+ }
+ else
+#endif
+ {
+ kernel_assert(kernel_data.bvh.use_qbvh == false);
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+ ray,
+ ss_isect,
+ subsurface_object,
+ lcg_state,
+ max_hits);
+ }
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
new file mode 100644
index 00000000000..68a11b65ad7
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -0,0 +1,466 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+# include "qbvh_traversal.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
+#endif
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
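
Stripped of instancing, hair, and SIMD, the control flow of the function
below is the standard two-child stack traversal. A tiny self-contained model
of just that pattern (toy data, not kernel code):

    #include <cstdio>
    #include <utility>

    #define SENTINEL 0x76543210  /* plays the role of ENTRYPOINT_SENTINEL */

    /* addr >= 0 is an inner node, addr < 0 encodes a leaf as -addr-1;
     * child hit results and distances are precomputed toy data here. */
    struct ToyNode { float dist0, dist1; int child0, child1; bool hit0, hit1; };

    int main()
    {
        /* Root: both children hit, child1 (leaf 1) is the nearer one. */
        ToyNode nodes[1] = {{2.0f, 1.0f, -1, -2, true, true}};

        int traversal_stack[64];
        int stack_ptr = 0;
        traversal_stack[0] = SENTINEL;
        int node_addr = 0;

        do {
            while(node_addr >= 0 && node_addr != SENTINEL) {
                const ToyNode &n = nodes[node_addr];
                int c0 = n.child0, c1 = n.child1;
                if(n.hit0 && n.hit1) {
                    /* Both children hit: descend the nearer, push the farther. */
                    if(n.dist1 < n.dist0)
                        std::swap(c0, c1);
                    node_addr = c0;
                    traversal_stack[++stack_ptr] = c1;
                }
                else if(n.hit0 || n.hit1) {
                    node_addr = n.hit0 ? c0 : c1;
                }
                else {
                    node_addr = traversal_stack[stack_ptr--];  /* miss: pop */
                }
            }
            if(node_addr != SENTINEL) {
                printf("visit leaf %d\n", -node_addr - 1);  /* nearest first */
                node_addr = traversal_stack[stack_ptr--];   /* pop after leaf */
            }
        } while(node_addr != SENTINEL);
        return 0;
    }
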
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , uint *lcg_state,
+ float difl,
+ float extmax
+#endif
+ )
+{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - test restrict attribute for pointers
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversal_stack[BVH_STACK_SIZE];
+ traversal_stack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* ray parameters in registers */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ BVH_DEBUG_INIT();
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect->t);
+# endif
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ int node_addr_child1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#if !defined(__KERNEL_SSE2__)
+# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(difl != 0.0f) {
+ traverse_mask = NODE_INTERSECT_ROBUST(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ difl,
+ extmax,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else
+# endif
+ {
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ node_addr,
+ visibility,
+ dist);
+ }
+#else // __KERNEL_SSE2__
+# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(difl != 0.0f) {
+ traverse_mask = NODE_INTERSECT_ROBUST(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ difl,
+ extmax,
+ node_addr,
+ visibility,
+ dist);
+ }
+ else
+# endif
+ {
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ visibility,
+ dist);
+ }
+#endif // __KERNEL_SSE2__
+
+ node_addr = __float_as_int(cnodes.z);
+ node_addr_child1 = __float_as_int(cnodes.w);
+
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool is_closest_child1 = (dist[1] < dist[0]);
+ if(is_closest_child1) {
+ int tmp = node_addr;
+ node_addr = node_addr_child1;
+ node_addr_child1 = tmp;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = node_addr_child1;
+ }
+ else {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
+ node_addr = node_addr_child1;
+ }
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ BVH_DEBUG_NEXT_STEP();
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ const int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* pop */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+
+ /* primitive intersection */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect(kg,
+ &isect_precalc,
+ isect,
+ P,
+ visibility,
+ object,
+ prim_addr))
+ {
+ /* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+#else
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+#endif
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(motion_triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr))
+ {
+ /* shadow ray early termination */
+# if defined(__KERNEL_SSE2__)
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+# else
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+# endif
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ bool hit;
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = bvh_cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ else {
+ hit = bvh_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ if(hit) {
+ /* shadow ray early termination */
+# if defined(__KERNEL_SSE2__)
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+# else
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+# endif
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_HAIR) */
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* instance push */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ BVH_DEBUG_NEXT_INSTANCE();
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* instance pop */
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
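+/* Thin dispatch wrapper: selects the QBVH or regular BVH specialization at
+ * run time from kernel_data.bvh.use_qbvh, so callers only ever deal with
+ * BVH_FUNCTION_NAME. */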
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , uint *lcg_state,
+ float difl,
+ float extmax
+#endif
+ )
+{
+#ifdef __QBVH__
+ if(kernel_data.bvh.use_qbvh) {
+ return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+ ray,
+ isect,
+ visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , lcg_state,
+ difl,
+ extmax
+#endif
+ );
+ }
+ else
+#endif
+ {
+ kernel_assert(kernel_data.bvh.use_qbvh == false);
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+ ray,
+ isect,
+ visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , lcg_state,
+ difl,
+ extmax
+#endif
+ );
+ }
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
new file mode 100644
index 00000000000..03499e94347
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -0,0 +1,336 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+# include "qbvh_volume.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
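+/* Illustrative instantiation (a sketch; the actual function name and feature
+ * mask are defined by the kernel file that includes this header):
+ *
+ *   #define BVH_FUNCTION_NAME bvh_intersect_volume
+ *   #define BVH_FUNCTION_FEATURES (BVH_INSTANCING|BVH_MOTION)
+ *   #include "bvh_volume.h"
+ *
+ * The BVH_NAME_* macros then expand BVH_FUNCTION_FULL_NAME(BVH) to
+ * BVH_bvh_intersect_volume and BVH_FUNCTION_FULL_NAME(QBVH) to
+ * QBVH_bvh_intersect_volume, with the plain BVH_FUNCTION_NAME wrapper at the
+ * bottom of this file dispatching between the two. */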
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non-shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - test restrict attribute for pointers
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversal_stack[BVH_STACK_SIZE];
+ traversal_stack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* ray parameters in registers */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect->t);
+# endif
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ int node_addr_child1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#if !defined(__KERNEL_SSE2__)
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ node_addr,
+ visibility,
+ dist);
+#else // __KERNEL_SSE2__
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ visibility,
+ dist);
+#endif // __KERNEL_SSE2__
+
+ node_addr = __float_as_int(cnodes.z);
+ node_addr_child1 = __float_as_int(cnodes.w);
+
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool is_closest_child1 = (dist[1] < dist[0]);
+ if(is_closest_child1) {
+ int tmp = node_addr;
+ node_addr = node_addr_child1;
+ node_addr_child1 = tmp;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = node_addr_child1;
+ }
+ else {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
+ node_addr = node_addr_child1;
+ }
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ const int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* pop */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+
+ /* primitive intersection */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* only primitives from volume object */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ triangle_intersect(kg,
+ &isect_precalc,
+ isect,
+ P,
+ visibility,
+ object,
+ prim_addr);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* only primitives from volume object */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ motion_triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr);
+ }
+ break;
+ }
+#endif
+ default: {
+ break;
+ }
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* instance push */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* pop */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* instance pop */
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+#endif  /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+#ifdef __QBVH__
+ if(kernel_data.bvh.use_qbvh) {
+ return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+ ray,
+ isect,
+ visibility);
+ }
+ else
+#endif
+ {
+ kernel_assert(kernel_data.bvh.use_qbvh == false);
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+ ray,
+ isect,
+ visibility);
+ }
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
new file mode 100644
index 00000000000..b5405e8e57b
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -0,0 +1,411 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+# include "qbvh_volume_all.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
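+/* Caller-side sketch (names below are assumptions, not part of this file;
+ * with BVH_FUNCTION_NAME defined as bvh_intersect_volume_all the wrapper at
+ * the bottom of this file is the entry point):
+ *
+ *   Intersection hits[MAX_VOLUME_HITS];  // hypothetical caller buffer
+ *   uint num = bvh_intersect_volume_all(kg, &ray, hits,
+ *                                       MAX_VOLUME_HITS, visibility);
+ *   for(uint i = 0; i < num; i++)
+ *       shade_volume_hit(kg, &hits[i]);  // hypothetical consumer
+ *
+ * At most max_hits entries are filled, the return value is the number of
+ * entries written, and recorded distances are always in world space. */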
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non-shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - test restrict attribute for pointers
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversal_stack[BVH_STACK_SIZE];
+ traversal_stack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* ray parameters in registers */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ uint num_hits = 0;
+ isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ int node_addr_child1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#if !defined(__KERNEL_SSE2__)
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect_t,
+ node_addr,
+ visibility,
+ dist);
+#else // __KERNEL_SSE2__
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ node_addr,
+ visibility,
+ dist);
+#endif // __KERNEL_SSE2__
+
+ node_addr = __float_as_int(cnodes.z);
+ node_addr_child1 = __float_as_int(cnodes.w);
+
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool is_closest_child1 = (dist[1] < dist[0]);
+ if(is_closest_child1) {
+ int tmp = node_addr;
+ node_addr = node_addr_child1;
+ node_addr_child1 = tmp;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = node_addr_child1;
+ }
+ else {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
+ node_addr = node_addr_child1;
+ }
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ const int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ bool hit;
+
+ /* pop */
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+
+ /* primitive intersection */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* only primitives from volume object */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ hit = triangle_intersect(kg,
+ &isect_precalc,
+ isect_array,
+ P,
+ visibility,
+ object,
+ prim_addr);
+ if(hit) {
+						/* Update the number of hits now, so the max_hits check below is correct. */
+ num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
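+							/* Rescale the distances recorded inside this
+							 * instance from object space back to the
+							 * world-space ray before returning. */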
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+#endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ /* Move on to next entry in intersections array */
+ isect_array++;
+ isect_array->t = isect_t;
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* intersect ray against primitive */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* only primitives from volume object */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ hit = motion_triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr);
+ if(hit) {
+						/* Update the number of hits now, so the max_hits check below is correct. */
+ num_hits++;
+# if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+# endif
+ if(num_hits == max_hits) {
+# if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+# endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ /* Move on to next entry in intersections array */
+ isect_array++;
+ isect_array->t = isect_t;
+ }
+ }
+ break;
+ }
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+ default: {
+ break;
+ }
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* instance push */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_STACK_SIZE);
+ traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* pop */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+ float ignore_t = FLT_MAX;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+# if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr];
+ --stack_ptr;
+ }
+#endif  /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return num_hits;
+}
+
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+#ifdef __QBVH__
+ if(kernel_data.bvh.use_qbvh) {
+ return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ visibility);
+ }
+ else
+#endif
+ {
+ kernel_assert(kernel_data.bvh.use_qbvh == false);
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ visibility);
+ }
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
new file mode 100644
index 00000000000..4d8695bedec
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+struct QBVHStackItem {
+ int addr;
+ float dist;
+};
+
+/* TODO(sergey): Investigate if using intrinsics helps for both
+ * stack item swap and float comparison.
+ */
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a,
+ QBVHStackItem *ccl_restrict b)
+{
+ QBVHStackItem tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+ QBVHStackItem *ccl_restrict s2,
+ QBVHStackItem *ccl_restrict s3)
+{
+ if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+ if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
+ if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+ QBVHStackItem *ccl_restrict s2,
+ QBVHStackItem *ccl_restrict s3,
+ QBVHStackItem *ccl_restrict s4)
+{
+ if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+ if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
+ if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); }
+ if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); }
+ if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
+}
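+
+/* Note: the two overloads above are small sorting networks (3 comparators
+ * for three items, 5 for four) that order the just-pushed stack entries so
+ * the entry with the smallest dist ends up on top of the stack and is
+ * popped first. */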
+
+/* Axis-aligned nodes intersection */
+
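+/* The tests below are 4-wide slab tests: each kernel_tex_fetch_ssef() loads
+ * one bound component for all four children at once (slots 0..5 after the
+ * node header appear to hold lo.x, hi.x, lo.y, hi.y, lo.z, hi.z), and the
+ * near_x/far_x offsets select which side acts as the entry or exit plane
+ * for the current ray direction sign. */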
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& org_idir,
+#else
+ const sse3f& org,
+#endif
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr + 1;
+#ifdef __KERNEL_AVX2__
+ const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
+ const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
+ const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
+ const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
+ const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
+ const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
+#else
+ const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
+ const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
+ const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
+ const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
+ const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
+ const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
+#endif
+
+#ifdef __KERNEL_SSE41__
+ const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near));
+ const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far));
+ const sseb vmask = cast(tnear) > cast(tfar);
+ int mask = (int)movemask(vmask)^0xf;
+#else
+ const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const sseb vmask = tnear <= tfar;
+ int mask = (int)movemask(vmask);
+#endif
+ *dist = tnear;
+ return mask;
+}
+
+ccl_device_inline int qbvh_aligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& P_idir,
+#else
+ const sse3f& P,
+#endif
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr + 1;
+#ifdef __KERNEL_AVX2__
+ const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
+ const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
+ const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
+ const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
+ const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
+ const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
+#else
+ const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
+ const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
+ const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
+ const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
+ const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
+ const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
+#endif
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const sseb vmask = round_down*tnear <= round_up*tfar;
+ *dist = tnear;
+ return (int)movemask(vmask);
+}
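+
+/* In the robust variants difl is the hair minimum-width factor: scaling
+ * tnear down by (1 - difl) and tfar up by (1 + difl) slightly enlarges the
+ * accepted interval, so nodes that an artificially widened curve could
+ * still hit are not culled. */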
+
+/* Unaligned nodes intersection */
+
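+/* Unaligned nodes store a per-child affine transform (the rows fetched at
+ * offsets 1..12 below) that maps the ray into each child's local space,
+ * where the bounding box becomes the unit cube: tlower/tupper are the
+ * distances to the 0 and 1 planes, i.e. a slab test against [0,1]^3. */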
+ccl_device_inline int qbvh_unaligned_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& org_idir,
+#endif
+ const sse3f& org,
+ const sse3f& dir,
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
+ const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+ const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+ const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
+ const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+ const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+ const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
+ const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+ const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+ const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
+ const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+ const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+
+ const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+ const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+ const ssef nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const ssef tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const ssef tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+#ifdef __KERNEL_SSE41__
+	const ssef tnear_x = mini(tlower_x, tupper_x);
+	const ssef tnear_y = mini(tlower_y, tupper_y);
+	const ssef tnear_z = mini(tlower_z, tupper_z);
+	const ssef tfar_x = maxi(tlower_x, tupper_x);
+	const ssef tfar_y = maxi(tlower_y, tupper_y);
+	const ssef tfar_z = maxi(tlower_z, tupper_z);
+#else
+	const ssef tnear_x = min(tlower_x, tupper_x);
+	const ssef tnear_y = min(tlower_y, tupper_y);
+	const ssef tnear_z = min(tlower_z, tupper_z);
+	const ssef tfar_x = max(tlower_x, tupper_x);
+	const ssef tfar_y = max(tlower_y, tupper_y);
+	const ssef tfar_z = max(tlower_z, tupper_z);
+#endif
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+	const sseb vmask = tnear <= tfar;
+	*dist = tnear;
+	return movemask(vmask);
+}
+
+ccl_device_inline int qbvh_unaligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& P_idir,
+#endif
+ const sse3f& P,
+ const sse3f& dir,
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
+ const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+ const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+ const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
+ const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+ const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+ const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
+ const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+ const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+ const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
+ const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+ const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+
+ const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+
+ const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+ const ssef nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const ssef tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const ssef tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+
+#ifdef __KERNEL_SSE41__
+ const ssef tnear_x = mini(tlower_x, tupper_x);
+ const ssef tnear_y = mini(tlower_y, tupper_y);
+ const ssef tnear_z = mini(tlower_z, tupper_z);
+ const ssef tfar_x = maxi(tlower_x, tupper_x);
+ const ssef tfar_y = maxi(tlower_y, tupper_y);
+ const ssef tfar_z = maxi(tlower_z, tupper_z);
+#else
+ const ssef tnear_x = min(tlower_x, tupper_x);
+ const ssef tnear_y = min(tlower_y, tupper_y);
+ const ssef tnear_z = min(tlower_z, tupper_z);
+ const ssef tfar_x = max(tlower_x, tupper_x);
+ const ssef tfar_y = max(tlower_y, tupper_y);
+ const ssef tfar_z = max(tlower_z, tupper_z);
+#endif
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+ const sseb vmask = round_down*tnear <= round_up*tfar;
+ *dist = tnear;
+ return movemask(vmask);
+}
+
+/* Intersector wrappers.
+ *
+ * These check the node type and call the appropriate intersection code.
+ */
+
+ccl_device_inline int qbvh_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& org_idir,
+#endif
+ const sse3f& org,
+ const sse3f& dir,
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return qbvh_unaligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#endif
+ org,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+ else {
+ return qbvh_aligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#else
+ org,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const ssef& isect_near,
+ const ssef& isect_far,
+#ifdef __KERNEL_AVX2__
+ const sse3f& P_idir,
+#endif
+ const sse3f& P,
+ const sse3f& dir,
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ ssef *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return qbvh_unaligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#endif
+ P,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+ else {
+ return qbvh_aligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#else
+ P,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+}
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
new file mode 100644
index 00000000000..34753ff067d
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -0,0 +1,485 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
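+/* Unlike the closest-hit variants, this function returns true as soon as
+ * light is fully blocked: either a hit on a shader without
+ * SD_HAS_TRANSPARENT_SHADOW, or max_hits transparent hits. Otherwise it
+ * returns false, with the transparent hits recorded in isect_array and
+ * counted in *num_hits. */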
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ uint *num_hits)
+{
+ /* TODO(sergey):
+ * - Likely and unlikely for if() statements.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in CUDA thread-local memory. */
+ QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ *num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
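+	/* Presumably guards against non-finite ray origins: the SSE4.1
+	 * mini()/maxi() based slab test filters out NaNs by itself, while the
+	 * plain min()/max() path compiled here does not. */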
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ ssef tnear(0.0f), tfar(tmax);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+ sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ ssef dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7);
+ }
+
+ /* One child is hit, continue with that child. */
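+					/* __bscf() returns the index of the lowest set bit and
+					 * clears that bit in child_mask. */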
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ while(prim_addr < prim_addr2) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+
+ bool hit;
+
+ /* todo: specialized intersect functions which don't fill in
+ * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+ * might give a few % performance improvement */
+
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ hit = triangle_intersect(kg,
+ &isect_precalc,
+ isect_array,
+ P,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ hit = motion_triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ ray->time,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = bvh_cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ NULL,
+ 0, 0);
+ }
+ else {
+ hit = bvh_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ NULL,
+ 0, 0);
+ }
+ break;
+ }
+#endif
+ default: {
+ hit = false;
+ break;
+ }
+ }
+
+ /* Shadow ray early termination. */
+ if(hit) {
+						/* Update the number of hits now, so the max_hits check below is correct. */
+ (*num_hits)++;
+
+ /* detect if this surface has a shader with transparent shadows */
+
+ /* todo: optimize so primitive visibility flag indicates if
+ * the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ /* Move on to next entry in intersections array */
+ isect_array++;
+ isect_array->t = isect_t;
+ }
+
+ prim_addr++;
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+# endif
+
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ if(num_hits_in_instance) {
+ float t_fac;
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+
+ /* scale isect->t to adjust for instancing */
+ for(int i = 0; i < num_hits_in_instance; i++)
+ (isect_array-i-1)->t *= t_fac;
+ }
+ else {
+ float ignore_t = FLT_MAX;
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+# endif
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(tmax);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
new file mode 100644
index 00000000000..03794e3a882
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -0,0 +1,298 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
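+/* This variant traverses a single object's subtree: node_addr starts at
+ * kernel_tex_fetch(__object_node, subsurface_object) rather than the scene
+ * root, and hits accumulate into ss_isect, with lcg_state apparently used
+ * by the subsurface intersection functions to keep a random subset once
+ * more than max_hits intersections have been seen. */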
+ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ SubsurfaceIntersection *ss_isect,
+ int subsurface_object,
+ uint *lcg_state,
+ int max_hits)
+{
+ /* TODO(sergey):
+ * - Test if pushing distance on the stack helps (for non shadow rays).
+ * - Separate version for shadow rays.
+ * - Likely and unlikely for if() statements.
+ * - SSE for hair.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in CUDA thread-local memory. */
+ QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_tex_fetch(__object_node, subsurface_object);
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = ray->t;
+
+ ss_isect->num_hits = 0;
+
+ const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+ if(!(object_flag & SD_TRANSFORM_APPLIED)) {
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+ bvh_instance_motion_push(kg,
+ subsurface_object,
+ ray,
+ &P,
+ &dir,
+ &idir,
+ &isect_t,
+ &ob_itfm);
+#else
+ bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+#endif
+ object = subsurface_object;
+ }
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return;
+ }
+#endif
+
+ ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+ sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ ssef dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+					/* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ triangle_intersect_subsurface(kg,
+ &isect_precalc,
+ ss_isect,
+ P,
+ object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ motion_triangle_intersect_subsurface(kg,
+ ss_isect,
+ P,
+ dir,
+ ray->time,
+ object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
new file mode 100644
index 00000000000..f82ff661495
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -0,0 +1,505 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
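+
+/* For example, a hedged sketch of how one variation is instantiated (the
+ * names below are assumptions for illustration; the actual instantiations
+ * live in bvh.h):
+ *
+ *   #define BVH_FUNCTION_NAME bvh_intersect_hair
+ *   #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+ *   #include "qbvh_traversal.h"
+ *
+ * which, via BVH_FUNCTION_FULL_NAME(QBVH), compiles this file into a
+ * function named QBVH_bvh_intersect_hair. */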
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ ,uint *lcg_state,
+ float difl,
+ float extmax
+#endif
+ )
+{
+ /* TODO(sergey):
+ * - Test if pushing distance on the stack helps (for non shadow rays).
+ * - Separate version for shadow rays.
+ * - Likely and unlikely for if() statements.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in CUDA thread-local memory. */
+ QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[0].dist = -FLT_MAX;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+ float node_dist = -FLT_MAX;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ BVH_DEBUG_INIT();
+
+ ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+ sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+			if(UNLIKELY(node_dist > isect->t)
+#ifdef __VISIBILITY_FLAG__
+			   || (__float_as_uint(inodes.x) & visibility) == 0
+#endif
+			   )
+			{
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ int child_mask;
+ ssef dist;
+
+ BVH_DEBUG_NEXT_STEP();
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(difl != 0.0f) {
+				/* NOTE: We extend all the child bounding boxes instead of
+				 * fetching and checking visibility flags for each of them.
+				 *
+				 * Need to test if doing the opposite would be any faster.
+				 */
+ child_mask = NODE_INTERSECT_ROBUST(kg,
+ tnear,
+ tfar,
+# ifdef __KERNEL_AVX2__
+ P_idir4,
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+# endif
+# if BVH_FEATURE(BVH_HAIR)
+ dir4,
+# endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ &dist);
+ }
+ else
+#endif /* BVH_HAIR_MINIMUM_WIDTH */
+ {
+ child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+ }
+
+ if(child_mask != 0) {
+ float4 cnodes;
+				/* TODO(sergey): Investigate whether moving cnodes upwards
+				 * gives a speedup (it will be a different cache pattern, but
+				 * will avoid an extra check here).
+				 */
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ float d0 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ node_dist = d0;
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ node_dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ node_dist = d0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+#ifdef __VISIBILITY_FLAG__
+ if(UNLIKELY((node_dist > isect->t) ||
+ ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+ if(UNLIKELY((node_dist > isect->t)))
+#endif
+ {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect(kg,
+ &isect_precalc,
+ isect,
+ P,
+ visibility,
+ object,
+ prim_addr)) {
+ tfar = ssef(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(motion_triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr)) {
+ tfar = ssef(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_STEP();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ bool hit;
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = bvh_cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ else {
+ hit = bvh_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ if(hit) {
+ tfar = ssef(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_HAIR) */
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
+# else
+ qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
+# endif
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[stack_ptr].dist = -FLT_MAX;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ BVH_DEBUG_NEXT_INSTANCE();
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
new file mode 100644
index 00000000000..b4f334eb842
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -0,0 +1,374 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
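+
+/* A hedged usage sketch (the wrapper name and visibility mask below are
+ * assumptions for illustration): only objects flagged SD_OBJECT_HAS_VOLUME
+ * are considered, and the single closest such hit is recorded:
+ *
+ *   Intersection isect;
+ *   if(QBVH_bvh_intersect_volume(kg, &ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+ *     // The ray crosses a volume boundary at distance isect.t.
+ *   }
+ */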
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+ /* TODO(sergey):
+ * - Test if pushing distance on the stack helps.
+ * - Likely and unlikely for if() statements.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in CUDA thread-local memory. */
+ QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+ sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+#ifdef __VISIBILITY_FLAG__
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ ssef dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						/* Only primitives from volume objects. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						/* Only primitives from volume objects. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+# endif
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
new file mode 100644
index 00000000000..a877e5bb341
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -0,0 +1,448 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
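+
+/* A hedged usage sketch (names assumed for illustration): up to max_hits
+ * intersections with volume objects are written to isect_array and the
+ * number found is returned, with t values already rescaled to world space:
+ *
+ *   Intersection hits[64];
+ *   uint num_hits = QBVH_bvh_intersect_volume_all(kg, &ray, hits,
+ *                                                 64, PATH_RAY_ALL_VISIBILITY);
+ *   for(uint i = 0; i < num_hits; i++) {
+ *     // hits[i].t is a world-space distance along the ray.
+ *   }
+ */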
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+ /* TODO(sergey):
+ * - Test if pushing distance on the stack helps.
+ * - Likely and unlikely for if() statements.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in CUDA thread-local memory. */
+ QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ uint num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+		return 0;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+ sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+ IsectPrecalc isect_precalc;
+ triangle_intersect_precalc(dir, &isect_precalc);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+#ifdef __VISIBILITY_FLAG__
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ ssef dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ qbvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+ bool hit;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						/* Only primitives from volume objects. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
+ if(hit) {
+								/* Update the number of hits now, so the max_hits check below is correct. */
+ num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ if(num_hits == max_hits) {
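+								/* Hits recorded while inside an instance store
+								 * object-space t; scale them by the inverse
+								 * length of the object-space ray direction so
+								 * the caller gets world-space distances. */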
+#if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+#endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+								/* Move on to the next entry in the intersections array. */
+ isect_array++;
+ isect_array->t = isect_t;
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						/* Only primitives from volume objects. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ if(hit) {
+							/* Update the number of hits now, so the max_hits check below is correct. */
+ num_hits++;
+# if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+# endif
+ if(num_hits == max_hits) {
+# if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+# endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+							/* Move on to the next entry in the intersections array. */
+ isect_array++;
+ isect_array->t = isect_t;
+ }
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+# else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+# endif
+
+ if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+ if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+ if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+ tfar = ssef(isect_t);
+ idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ triangle_intersect_precalc(dir, &isect_precalc);
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+ float ignore_t = FLT_MAX;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+# endif
+ triangle_intersect_precalc(dir, &isect_precalc);
+ }
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			/* Restore the world-space ray interval before resuming traversal. */
+			isect_t = tmax;
+			isect_array->t = isect_t;
+			tfar = ssef(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return num_hits;
+}
+
+#undef NODE_INTERSECT