diff options
author | Thomas Dinges <blender@dingto.org> | 2014-06-13 23:13:18 +0400 |
---|---|---|
committer | Thomas Dinges <blender@dingto.org> | 2014-06-13 23:59:12 +0400 |
commit | cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed (patch) | |
tree | 578ee132eab87d348147e49c91e1929660558c20 /intern/cycles/kernel/geom/geom_bvh_subsurface.h | |
parent | d0573ce9054e325c0ad2fbb943087e0f8b9e159a (diff) |
Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.
This makes the code a bit easier to understand, and might come in handy
if we want to reuse more Embree code.
Differential Revision: https://developer.blender.org/D482
Code by Brecht, with fixes by Lockal, Sergey and myself.
Diffstat (limited to 'intern/cycles/kernel/geom/geom_bvh_subsurface.h')
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_subsurface.h | 54 |
1 file changed, 27 insertions, 27 deletions
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index a19f05dd371..a8f57cffa78 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef 
tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + /* calculate { c0min, c1min, -c0max, -c1max} */ + const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - 
tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif |