diff options
author | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-18 13:36:06 +0400 |
commit | d57c6748c4ebb37246caf25d4900ef6d5c16c0fe (patch) | |
tree | 08491bec3d7310f7df1e2171c8fb44a68d508a90 /intern | |
parent | 9131adca9f748f794c18c71d36f830a961c218b4 (diff) |
Cycles: optimization for BVH traversal on CPUs with SSE3, using code from Embree.
On the BMW scene, this gives roughly a 10% speedup overall with clang/gcc, and a 30%
speedup with Visual Studio (2008). It turns out Visual Studio was optimizing the
existing code quite poorly compared to the pretty good autovectorization by clang/gcc,
but hand-written SSE code also gives a smaller speed boost there.
This code isn't enabled yet when using the hair minimum width feature; that
feature still needs to be made to work with the SSE code.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/bvh/bvh.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_bvh.h | 116 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_bvh_traversal.h | 218 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_sse3.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/util/util_types.h | 57 |
5 files changed, 275 insertions, 124 deletions
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 82a444bda76..69ccf2588c9 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -552,9 +552,9 @@ void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int { int4 data[BVH_NODE_SIZE] = { - make_int4(__float_as_int(b0.min.x), __float_as_int(b0.max.x), __float_as_int(b0.min.y), __float_as_int(b0.max.y)), - make_int4(__float_as_int(b1.min.x), __float_as_int(b1.max.x), __float_as_int(b1.min.y), __float_as_int(b1.max.y)), - make_int4(__float_as_int(b0.min.z), __float_as_int(b0.max.z), __float_as_int(b1.min.z), __float_as_int(b1.max.z)), + make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)), + make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)), + make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)), make_int4(c0, c1, visibility0, visibility1) }; diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h index ae9677ed5cb..9f6d79e13fb 100644 --- a/intern/cycles/kernel/kernel_bvh.h +++ b/intern/cycles/kernel/kernel_bvh.h @@ -112,80 +112,8 @@ __device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, cons } #endif -/* intersect two bounding boxes */ -#ifdef __HAIR__ -__device_inline void bvh_node_intersect(KernelGlobals *kg, - bool *traverseChild0, bool *traverseChild1, - bool *closestChild1, int *nodeAddr0, int *nodeAddr1, - float3 P, float3 idir, float t, uint visibility, int nodeAddr, float difl, float extmax) -{ -#else -__device_inline void bvh_node_intersect(KernelGlobals *kg, - bool *traverseChild0, bool *traverseChild1, - bool *closestChild1, int *nodeAddr0, int *nodeAddr1, - float3 P, float3 idir, float t, uint visibility, int nodeAddr) -{ -#endif - - /* fetch node data */ - float4 n0xy = kernel_tex_fetch(__bvh_nodes, 
nodeAddr*BVH_NODE_SIZE+0); - float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - float3 ood = P * idir; - NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z; - NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z; - NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z; - NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - -#ifdef __HAIR__ - if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } -#endif - - /* decide which nodes to traverse next */ -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% 
performance hit, how to solve? */ - *traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - *traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -#else - *traverseChild0 = (c0max >= c0min); - *traverseChild1 = (c1max >= c1min); -#endif - - *nodeAddr0 = __float_as_int(cnodes.x); - *nodeAddr1 = __float_as_int(cnodes.y); - - *closestChild1 = (c1min < c0min); -} - /* Sven Woop's algorithm */ -__device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect, +__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 idir, uint visibility, int object, int triAddr) { /* compute and check intersection t-value */ @@ -223,10 +151,13 @@ __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *ise isect->u = u; isect->v = v; isect->t = t; + return true; } } } } + + return false; } #ifdef __HAIR__ @@ -280,7 +211,7 @@ __device_inline void curvebounds(float *lower, float *upper, float *extremta, fl } } -__device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, +__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) { float epsilon = 0.0f; @@ -346,7 +277,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio float zextrem[4]; curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return; + return false; /*minimum width extension*/ float mw_extension = min(difl * fabsf(upper), extmax); @@ -355,17 +286,18 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio float xextrem[4]; curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], 
curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); if(lower > r_ext || upper < -r_ext) - return; + return false; float yextrem[4]; curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); if(lower > r_ext || upper < -r_ext) - return; + return false; /*setup recurrent loop*/ int level = 1 << depth; int tree = 0; float resol = 1.0f / (float)level; + bool hit = false; /*begin loop*/ while(!(tree >> (depth))) { @@ -557,7 +489,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio /*stochastic fade from minimum width*/ if(lcg_state && coverage != 1.0f) { if(lcg_step(lcg_state) > coverage) - return; + return hit; } #ifdef __VISIBILITY_FLAG__ @@ -574,6 +506,7 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio isect->v = 0.0f; /*isect->v = 1.0f - coverage; */ isect->t = t; + hit = true; } tree++; @@ -584,9 +517,11 @@ __device_inline void bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio level = level >> 1; } } + + return hit; } -__device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, +__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) { /* curve Intersection check */ @@ -630,7 +565,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, sphere_b = dot(dir,sphere_dif); float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r; if(sdisc < 0.0f) - return; + return false; /* obtain parameters and test midpoint distance for suitable modes*/ float3 tg = (p2 - p1) / l; @@ -645,9 +580,9 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, float zcentre = difz + (dirz * tcentre); if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return; + 
return false; if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return; + return false; /* test minimum separation*/ float3 cprod = cross(tg, dir); @@ -662,7 +597,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, distscaled = (distscaled*distscaled)/cprodsq; if(distscaled > mr*mr) - return; + return false; /* calculate true intersection*/ float3 tdif = P - p1 + tcentre * dir; @@ -672,7 +607,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, float td = tb*tb - 4*a*tc; if (td < 0.0f) - return; + return false; float rootd = 0.0f; float correction = 0.0f; @@ -706,7 +641,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, adjradius = adjradius / (r1 + z * gd); if(lcg_state && adjradius != 1.0f) { if(lcg_step(lcg_state) > adjradius) - return; + return false; } /* --- */ @@ -719,7 +654,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; if(a2*c2 < 0.0f) - return; + return false; } } @@ -740,9 +675,13 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, if(backface) isect->u = -isect->u; + + return true; } } } + + return false; } #endif @@ -751,7 +690,7 @@ __device_inline void bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, * only want to intersect with primitives in the same object, and if case of * multiple hits we pick a single random primitive as the intersection point. 
*/ -__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect, +__device_inline bool bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect, float3 P, float3 idir, int object, int triAddr, float tmax, int *num_hits, float subsurface_random) { /* compute and check intersection t-value */ @@ -786,10 +725,13 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters isect->u = u; isect->v = v; isect->t = t; + return true; } } } } + + return false; } #endif diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h index 2d75af32abd..9fd466a6731 100644 --- a/intern/cycles/kernel/kernel_bvh_traversal.h +++ b/intern/cycles/kernel/kernel_bvh_traversal.h @@ -1,6 +1,8 @@ /* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation - * Modifications Copyright 2011, Blender Foundation. + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2013, Blender Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,6 +43,14 @@ __device bool BVH_FUNCTION_NAME #endif ) { + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - SSE for hair + * - test restrict attribute for pointers + */ + /* traversal stack in CUDA thread-local memory */ int traversalStack[BVH_STACK_SIZE]; traversalStack[0] = ENTRYPOINT_SENTINEL; @@ -70,6 +80,28 @@ __device bool BVH_FUNCTION_NAME isect->u = 0.0f; isect->v = 0.0f; +#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH) + const __m128i shuffle_identity = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m128i shuffle_swap = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + + const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 0x00000000); + __m128 Psplat[3], idirsplat[3]; + + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn)); + idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn)); + idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn)); + + __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + __m128i shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap; + __m128i shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap; + __m128i shufflez = (idir.z >= 0)? 
shuffle_identity: shuffle_swap; +#endif + /* traversal loop */ do { do @@ -77,46 +109,121 @@ __device bool BVH_FUNCTION_NAME /* traverse internal nodes */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1, closestChild1; + bool traverseChild0, traverseChild1; int nodeAddrChild1; + float t = isect->t; + +#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH) + /* Intersect two child bounding boxes, non-SSE version */ + + /* fetch node data */ + float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), 
max(c1loy, c1hiy), max(c1loz, c1hiz), t); #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) && !FEATURE(BVH_SUBSURFACE) - bvh_node_intersect(kg, &traverseChild0, &traverseChild1, - &closestChild1, &nodeAddr, &nodeAddrChild1, - P, idir, isect->t, visibility, nodeAddr, difl, extmax); + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } +#endif + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); #else - bvh_node_intersect(kg, &traverseChild0, &traverseChild1, - &closestChild1, &nodeAddr, &nodeAddrChild1, -#ifdef __HAIR__ - P, idir, isect->t, visibility, nodeAddr, 0.0f, 0.0f); + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); +#endif + +#else // __KERNEL_SSE3__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]); + const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]); + const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle8(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]); + + const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), _mm_castsi128_ps(pn)); + const __m128 
lrhit = _mm_cmple_ps(tminmax, shuffle8(tminmax, shuffle_swap)); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - P, idir, isect->t, visibility, nodeAddr); + traverseChild0 = (_mm_movemask_ps(lrhit) & 1); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2); #endif +#endif // __KERNEL_SSE3__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH) + bool closestChild1 = (c1min < c0min); +#else + union { __m128 m128; float v[4]; } uminmax; + uminmax.m128 = tminmax; + bool closestChild1 = uminmax.v[1] < uminmax.v[0]; #endif - if(traverseChild0 != traverseChild1) { - /* one child was intersected */ - if(traverseChild1) { + if(closestChild1) { + int tmp = nodeAddr; nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; } + + ++stackPtr; + traversalStack[stackPtr] = nodeAddrChild1; } else { - if(!traverseChild0) { + /* one child was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { /* neither child was intersected */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } - else { - /* both children were intersected, push the farther one */ - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; - } - - ++stackPtr; - traversalStack[stackPtr] = nodeAddrChild1; - } } } @@ -136,6 +243,7 @@ __device bool BVH_FUNCTION_NAME /* primitive intersection */ while(primAddr < primAddr2) { + bool hit; #if FEATURE(BVH_SUBSURFACE) /* only primitives from the same object */ uint tri_object = (object == ~0)? 
kernel_tex_fetch(__prim_object, primAddr): object; @@ -148,15 +256,16 @@ __device bool BVH_FUNCTION_NAME uint segment = kernel_tex_fetch(__prim_segment, primAddr); #if !FEATURE(BVH_SUBSURFACE) if(segment != ~0) { + if(kernel_data.curve_kernel_data.curveflags & CURVE_KN_INTERPOLATE) #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) - bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); + hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); else - bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); + hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); #else - bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); + hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); else - bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); + hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); #endif } else @@ -166,17 +275,27 @@ __device bool BVH_FUNCTION_NAME #if FEATURE(BVH_HAIR) if(segment == ~0) #endif - bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random); + hit = bvh_triangle_intersect_subsurface(kg, isect, P, idir, object, primAddr, tmax, &num_hits, subsurface_random); } #else - bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); + hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); /* shadow ray early termination */ - if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0) +#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(hit) { + if(visibility == PATH_RAY_SHADOW_OPAQUE) + return true; + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + } +#else + if(hit && visibility == 
PATH_RAY_SHADOW_OPAQUE) return true; #endif +#endif + primAddr++; } } @@ -196,6 +315,22 @@ __device bool BVH_FUNCTION_NAME bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); #endif +#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH) + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn)); + idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn)); + idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn)); + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap; + shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap; + shufflez = (idir.z >= 0)? shuffle_identity: shuffle_swap; +#endif + ++stackPtr; traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; @@ -223,6 +358,23 @@ __device bool BVH_FUNCTION_NAME #else bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); #endif + +#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH) + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn)); + idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn)); + idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn)); + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + shufflex = (idir.x >= 0)? shuffle_identity: shuffle_swap; + shuffley = (idir.y >= 0)? shuffle_identity: shuffle_swap; + shufflez = (idir.z >= 0)? 
shuffle_identity: shuffle_swap; +#endif + object = ~0; nodeAddr = traversalStack[stackPtr]; --stackPtr; diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 6982570c59b..ccd3ee5ac74 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -22,6 +22,8 @@ #ifdef WITH_OPTIMIZED_KERNEL +#define __KERNEL_SSE3__ + #include "kernel.h" #include "kernel_compat_cpu.h" #include "kernel_math.h" diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 472a707d8fd..d4ff95b0663 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -72,13 +72,21 @@ #include <tmmintrin.h> /* SSE 3 */ #include <smmintrin.h> /* SSE 4 */ +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif + +#ifndef __KERNEL_SSE3__ #define __KERNEL_SSE3__ +#endif + +#ifndef __KERNEL_SSE4__ #define __KERNEL_SSE4__ +#endif #else -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__KERNEL_SSE3__) /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. 
* Since we can't avoid including <windows.h>, better only include that */ @@ -87,9 +95,16 @@ #else #include <xmmintrin.h> /* SSE 1 */ #include <emmintrin.h> /* SSE 2 */ + +#ifdef __KERNEL_SSE3__ +#include <pmmintrin.h> /* SSE 3 */ +#include <tmmintrin.h> /* SSE 3 */ +#endif #endif +#ifndef __KERNEL_SSE2__ #define __KERNEL_SSE2__ +#endif #endif @@ -471,6 +486,46 @@ __device_inline int4 make_int4(const float3& f) #endif +#ifdef __KERNEL_SSE3__ + +/* SSE shuffle utility functions */ + +__device_inline const __m128 shuffle8(const __m128& a, const __m128i& shuf) +{ + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& a, const __m128& b) +{ + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m128 shuffle(const __m128& b) +{ + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} +#endif + +#if defined(__KERNEL_SSE2__) && defined(_MSC_VER) + +/* count zeros from start or end of integer bits */ + +__device_inline uint32_t __builtin_ctz(uint32_t i) +{ + unsigned long r = 0; + _BitScanForward(&r, i); + return (uint32_t)r; +} + +__device_inline uint32_t __builtin_clz(uint32_t i) +{ + unsigned long r = 0; + _BitScanReverse(&r, i); + return (uint32_t)r; +} + +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_H__ */ |