Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@pandora.be>2013-06-18 13:36:06 +0400
committerBrecht Van Lommel <brechtvanlommel@pandora.be>2013-06-18 13:36:06 +0400
commitd57c6748c4ebb37246caf25d4900ef6d5c16c0fe (patch)
tree08491bec3d7310f7df1e2171c8fb44a68d508a90 /intern
parent9131adca9f748f794c18c71d36f830a961c218b4 (diff)
Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.
On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and a 30% speedup with Visual Studio (2008). It turns out Visual Studio was optimizing the existing code quite poorly compared to the pretty good autovectorization by clang/gcc, but hand-written SSE code also gives a smaller speed boost there. This code isn't enabled yet when using the hair minimum width feature; that still needs to be made to work with the SSE code.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/bvh/bvh.cpp6
-rw-r--r--intern/cycles/kernel/kernel_bvh.h116
-rw-r--r--intern/cycles/kernel/kernel_bvh_traversal.h218
-rw-r--r--intern/cycles/kernel/kernel_sse3.cpp2
-rw-r--r--intern/cycles/util/util_types.h57
5 files changed, 275 insertions, 124 deletions
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 82a444bda76..69ccf2588c9 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -552,9 +552,9 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int
{
int4 data[BVH_NODE_SIZE] =
{
- make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)),
- make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)),
- make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)),
+ make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)),
+ make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)),
+ make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)),
make_int4(c0, c1, visibility0, visibility1)
};
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index ae9677ed5cb..9f6d79e13fb 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -112,80 +112,8 @@ __device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, cons
}
#endif
-/* intersect two bounding boxes */
-#ifdef __HAIR__
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
- bool *traverseChild0, bool *traverseChild1,
- bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
- float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax)
-{
-#else
-__device_inline void bvh_node_intersect(KernelGlobals *kg,
- bool *traverseChild0, bool *traverseChild1,
- bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
- float3 P, float3 idir, float t, uint visibility, int nodeAddr)
-{
-#endif
-
- /* fetch node data */
- float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
- float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
- float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
- /* intersect ray against child nodes */
- float3 ood = P * idir;
- NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
- NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
- NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
- NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-#ifdef __HAIR__
- if(difl != 0.0f) {
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
- }
-#endif
-
- /* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- *traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
- *traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
- *traverseChild0 = (c0max >= c0min);
- *traverseChild1 = (c1max >= c1min);
-#endif
-
- *nodeAddr0 = __float_as_int(cnodes.x);
- *nodeAddr1 = __float_as_int(cnodes.y);
-
- *closestChild1 = (c1min < c0min);
-}
-
/* Sven Woop's algorithm */
-__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int triAddr)
{
/* compute and check intersection t-value */
@@ -223,10 +151,13 @@ __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *ise
isect->u = u;
isect->v = v;
isect->t = t;
+ return true;
}
}
}
}
+
+ return false;
}
#ifdef __HAIR__
@@ -280,7 +211,7 @@ __device_inline void curvebounds(float *lower, float *upper, float *extremta, fl
}
}
-__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
float epsilon = 0.0f;
@@ -346,7 +277,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
float zextrem[4];
curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
if(lower - r_curr > isect->t || upper + r_curr < epsilon)
- return;
+ return false;
/*minimum width extension*/
float mw_extension = min(difl * fabsf(upper), extmax);
@@ -355,17 +286,18 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
float xextrem[4];
curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
if(lower > r_ext || upper < -r_ext)
- return;
+ return false;
float yextrem[4];
curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
if(lower > r_ext || upper < -r_ext)
- return;
+ return false;
/*setup recurrent loop*/
int level = 1 << depth;
int tree = 0;
float resol = 1.0f / (float)level;
+ bool hit = false;
/*begin loop*/
while(!(tree >> (depth))) {
@@ -557,7 +489,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
/*stochastic fade from minimum width*/
if(lcg_state && coverage != 1.0f) {
if(lcg_step(lcg_state) > coverage)
- return;
+ return hit;
}
#ifdef __VISIBILITY_FLAG__
@@ -574,6 +506,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
isect->v = 0.0f;
/*isect->v = 1.0f - coverage; */
isect->t = t;
+ hit = true;
}
tree++;
@@ -584,9 +517,11 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
level = level >> 1;
}
}
+
+ return hit;
}
-__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
{
/* curve Intersection check */
@@ -630,7 +565,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
sphere_b = dot(dir,sphere_dif);
float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
if(sdisc < 0.0f)
- return;
+ return false;
/* obtain parameters and test midpoint distance for suitable modes*/
float3 tg = (p2 - p1) / l;
@@ -645,9 +580,9 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float zcentre = difz + (dirz * tcentre);
if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
- return;
+ return false;
if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
- return;
+ return false;
/* test minimum separation*/
float3 cprod = cross(tg, dir);
@@ -662,7 +597,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
distscaled = (distscaled*distscaled)/cprodsq;
if(distscaled > mr*mr)
- return;
+ return false;
/* calculate true intersection*/
float3 tdif = P - p1 + tcentre * dir;
@@ -672,7 +607,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float td = tb*tb - 4*a*tc;
if (td < 0.0f)
- return;
+ return false;
float rootd = 0.0f;
float correction = 0.0f;
@@ -706,7 +641,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
adjradius = adjradius / (r1 + z * gd);
if(lcg_state && adjradius != 1.0f) {
if(lcg_step(lcg_state) > adjradius)
- return;
+ return false;
}
/* --- */
@@ -719,7 +654,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
if(a2*c2 < 0.0f)
- return;
+ return false;
}
}
@@ -740,9 +675,13 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
if(backface)
isect->u = -isect->u;
+
+ return true;
}
}
}
+
+ return false;
}
#endif
@@ -751,7 +690,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
* only want to intersect with primitives in the same object, and if case of
* multiple hits we pick a single random primitive as the intersection point. */
-__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
+__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect,
float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random)
{
/* compute and check intersection t-value */
@@ -786,10 +725,13 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
isect->u = u;
isect->v = v;
isect->t = t;
+ return true;
}
}
}
}
+
+ return false;
}
#endif
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index 2d75af32abd..9fd466a6731 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -1,6 +1,8 @@
/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,6 +43,14 @@ __device bool BVH_FUNCTION_NAME
#endif
)
{
+ /* todo:
+ * - test if pushing distance on the stack helps (for non shadow rays)
+ * - separate version for shadow rays
+ * - likely and unlikely for if() statements
+ * - SSE for hair
+ * - test restrict attribute for pointers
+ */
+
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@@ -70,6 +80,28 @@ __device bool BVH_FUNCTION_NAME
isect->u = 0.0f;
isect->v = 0.0f;
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ const __m128i shuffle_identity = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ const __m128i shuffle_swap = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+ const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 0x00000000);
+ __m128 Psplat[3], idirsplat[3];
+
+ Psplat[0] = _mm_set_ps1(P.x);
+ Psplat[1] = _mm_set_ps1(P.y);
+ Psplat[2] = _mm_set_ps1(P.z);
+
+ idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+ idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+ idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+ __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+ __m128i shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+ __m128i shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+ __m128i shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
/* traversal loop */
do {
do
@@ -77,46 +109,121 @@ __device bool BVH_FUNCTION_NAME
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
{
- bool traverseChild0, traverseChild1, closestChild1;
+ bool traverseChild0, traverseChild1;
int nodeAddrChild1;
+ float t = isect->t;
+
+#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ /* Intersect two child bounding boxes, non-SSE version */
+
+ /* fetch node data */
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+ /* intersect ray against child nodes */
+ NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+ NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+ NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) && !FEATURE(BVH_SUBSURFACE)
- bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
- &closestChild1, &nodeAddr, &nodeAddrChild1,
- P, idir, isect->t, visibility, nodeAddr, difl, extmax);
+ if(difl != 0.0f) {
+ float hdiff = 1.0f + difl;
+ float ldiff = 1.0f - difl;
+ if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+ c0min = max(ldiff * c0min, c0min - extmax);
+ c0max = min(hdiff * c0max, c0max + extmax);
+ }
+ if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+ c1min = max(ldiff * c1min, c1min - extmax);
+ c1max = min(hdiff * c1max, c1max + extmax);
+ }
+ }
+#endif
+
+ /* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
#else
- bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
- &closestChild1, &nodeAddr, &nodeAddrChild1,
-#ifdef __HAIR__
- P, idir, isect->t, visibility, nodeAddr, 0.0f, 0.0f);
+ traverseChild0 = (c0max >= c0min);
+ traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE3__
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+ /* fetch node data */
+ __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ float4 cnodes = ((float4*)bvh_nodes)[3];
+
+ /* intersect ray against child nodes */
+ const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
+ const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
+ const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+
+ const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), _mm_castsi128_ps(pn));
+ const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle8(tminmax, shuffle_swap));
+
+ /* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
#else
- P, idir, isect->t, visibility, nodeAddr);
+ traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+ traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
#endif
+#endif // __KERNEL_SSE3__
+
+ nodeAddr = __float_as_int(cnodes.x);
+ nodeAddrChild1 = __float_as_int(cnodes.y);
+
+ if(traverseChild0 && traverseChild1) {
+ /* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ bool closestChild1 = (c1min < c0min);
+#else
+ union { __m128 m128; float v[4]; } uminmax;
+ uminmax.m128 = tminmax;
+ bool closestChild1 = uminmax.v[1] < uminmax.v[0];
#endif
- if(traverseChild0 != traverseChild1) {
- /* one child was intersected */
- if(traverseChild1) {
+ if(closestChild1) {
+ int tmp = nodeAddr;
nodeAddr = nodeAddrChild1;
+ nodeAddrChild1 = tmp;
}
+
+ ++stackPtr;
+ traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- if(!traverseChild0) {
+ /* one child was intersected */
+ if(traverseChild1) {
+ nodeAddr = nodeAddrChild1;
+ }
+ else if(!traverseChild0) {
/* neither child was intersected */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
- else {
- /* both children were intersected, push the farther one */
- if(closestChild1) {
- int tmp = nodeAddr;
- nodeAddr = nodeAddrChild1;
- nodeAddrChild1 = tmp;
- }
-
- ++stackPtr;
- traversalStack[stackPtr] = nodeAddrChild1;
- }
}
}
@@ -136,6 +243,7 @@ __device bool BVH_FUNCTION_NAME
/* primitive intersection */
while(primAddr < primAddr2) {
+ bool hit;
#if FEATURE(BVH_SUBSURFACE)
/* only primitives from the same object */
uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
@@ -148,15 +256,16 @@ __device bool BVH_FUNCTION_NAME
uint segment = kernel_tex_fetch(__prim_segment, primAddr);
#if !FEATURE(BVH_SUBSURFACE)
if(segment != ~0) {
+
if(kernel_data.curve_kernel_data.curveflags & CURVE_KN_INTERPOLATE)
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+ hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
else
- bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+ hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
#else
- bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+ hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
else
- bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+ hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
#endif
}
else
@@ -166,17 +275,27 @@ __device bool BVH_FUNCTION_NAME
#if FEATURE(BVH_HAIR)
if(segment == ~0)
#endif
- bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);
+ hit = bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random);
}
#else
- bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
+ hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
/* shadow ray early termination */
- if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(hit) {
+ if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ return true;
+
+ tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+ }
+#else
+ if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
#endif
+#endif
+
primAddr++;
}
}
@@ -196,6 +315,22 @@ __device bool BVH_FUNCTION_NAME
bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
#endif
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ Psplat[0] = _mm_set_ps1(P.x);
+ Psplat[1] = _mm_set_ps1(P.y);
+ Psplat[2] = _mm_set_ps1(P.z);
+
+ idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+ idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+ idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+ tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+ shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+ shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+ shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
++stackPtr;
traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
@@ -223,6 +358,23 @@ __device bool BVH_FUNCTION_NAME
#else
bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
#endif
+
+#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ Psplat[0] = _mm_set_ps1(P.x);
+ Psplat[1] = _mm_set_ps1(P.y);
+ Psplat[2] = _mm_set_ps1(P.z);
+
+ idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
+ idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
+ idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+
+ tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+ shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap;
+ shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap;
+ shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap;
+#endif
+
object = ~0;
nodeAddr = traversalStack[stackPtr];
--stackPtr;
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 6982570c59b..ccd3ee5ac74 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -22,6 +22,8 @@
#ifdef WITH_OPTIMIZED_KERNEL
+#define __KERNEL_SSE3__
+
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_math.h"
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 472a707d8fd..d4ff95b0663 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -72,13 +72,21 @@
#include <tmmintrin.h> /* SSE 3 */
#include <smmintrin.h> /* SSE 4 */
+#ifndef __KERNEL_SSE2__
#define __KERNEL_SSE2__
+#endif
+
+#ifndef __KERNEL_SSE3__
#define __KERNEL_SSE3__
+#endif
+
+#ifndef __KERNEL_SSE4__
#define __KERNEL_SSE4__
+#endif
#else
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__KERNEL_SSE3__)
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
@@ -87,9 +95,16 @@
#else
#include <xmmintrin.h> /* SSE 1 */
#include <emmintrin.h> /* SSE 2 */
+
+#ifdef __KERNEL_SSE3__
+#include <pmmintrin.h> /* SSE 3 */
+#include <tmmintrin.h> /* SSE 3 */
+#endif
#endif
+#ifndef __KERNEL_SSE2__
#define __KERNEL_SSE2__
+#endif
#endif
@@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f)
#endif
+#ifdef __KERNEL_SSE3__
+
+/* SSE shuffle utility functions */
+
+/* Permute the 16 bytes of `a` using the byte indices held in `shuf`.
+ * The float register is only reinterpreted as integer bits for
+ * _mm_shuffle_epi8 and reinterpreted back afterwards; no numeric
+ * conversion takes place. */
+__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf)
+{
+	const __m128i bits = _mm_castps_si128(a);
+	const __m128i permuted = _mm_shuffle_epi8(bits, shuf);
+	return _mm_castsi128_ps(permuted);
+}
+
+/* Two-operand float shuffle with compile-time lane indices.
+ * Forwards to _mm_shuffle_ps with the indices packed by _MM_SHUFFLE, so
+ * i0/i1 select lanes of `a` for result lanes 0-1 and i2/i3 select lanes
+ * of `b` for result lanes 2-3. */
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__device_inline const __m128 shuffle(const __m128& a, const __m128& b)
+{
+	const __m128 picked = _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+	return picked;
+}
+
+/* One-operand lane permutation with compile-time indices: result lane k
+ * takes lane i<k> of `b`. Done on the integer view of the register via
+ * _mm_shuffle_epi32, then reinterpreted back as floats. */
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__device_inline const __m128 shuffle(const __m128& b)
+{
+	const __m128i lanes = _mm_castps_si128(b);
+	return _mm_castsi128_ps(_mm_shuffle_epi32(lanes, _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+#endif
+
+#if defined(__KERNEL_SSE2__) && defined(_MSC_VER)
+
+/* count zeros from start or end of integer bits */
+
+/* MSVC stand-in for the compiler builtin of the same name: returns the
+ * index of the lowest set bit of `i` as reported by _BitScanForward,
+ * which equals the number of trailing zero bits. The intrinsic's own
+ * return value (whether any bit was found) is ignored, so the result
+ * for i == 0 should not be relied upon. */
+__device_inline uint32_t __builtin_ctz(uint32_t i)
+{
+	unsigned long lowest = 0;
+	_BitScanForward(&lowest, i);
+	return (uint32_t)lowest;
+}
+
+/* MSVC stand-in for the compiler builtin of the same name: returns the
+ * number of zero bits above the highest set bit of the 32-bit value `i`.
+ *
+ * _BitScanReverse reports the *index* of the highest set bit (0 being the
+ * least significant bit), not a count of leading zeros, so the index must
+ * be subtracted from 31: 0x80000000 -> 0, 1 -> 31.
+ *
+ * The intrinsic's own return value (whether any bit was found) is ignored,
+ * so the result for i == 0 should not be relied upon. */
+__device_inline uint32_t __builtin_clz(uint32_t i)
+{
+	unsigned long r = 0;
+	_BitScanReverse(&r, i);
+	return 31 - (uint32_t)r;
+}
+
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_H__ */