From cb96cf0b637360a84d86f4f375f28ccc79a53294 Mon Sep 17 00:00:00 2001 From: "Sv. Lockal" Date: Mon, 3 Feb 2014 20:46:13 +0400 Subject: Cycles: small optimization for SSE 4.1 bvh intersector Gives 0.7% - 1.3% speedup for BMW1M-MikePan scene. Reviewers: juicyfruit Differential Revision: https://developer.blender.org/D280 --- intern/cycles/kernel/kernel_bvh_subsurface.h | 31 +++++++--------------------- intern/cycles/kernel/kernel_bvh_traversal.h | 31 +++++++--------------------- intern/cycles/util/util_simd.h | 30 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 48 deletions(-) (limited to 'intern') diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h index bb51986b4f4..df82dda2435 100644 --- a/intern/cycles/kernel/kernel_bvh_subsurface.h +++ b/intern/cycles/kernel/kernel_bvh_subsurface.h @@ -66,20 +66,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); __m128 Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; Psplat[0] = _mm_set_ps1(P.x); Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - __m128 tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f); - shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* traversal loop */ @@ -139,9 +134,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]); + const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); + const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); + const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); @@ -242,15 +237,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f); - shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif ++stackPtr; @@ -285,15 +274,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f); - shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif object = ~0; diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h index 1ee1fbc3cb4..b4c63f5682c 100644 --- a/intern/cycles/kernel/kernel_bvh_traversal.h +++ b/intern/cycles/kernel/kernel_bvh_traversal.h @@ -75,20 +75,15 @@ ccl_device bool BVH_FUNCTION_NAME const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); __m128 Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; Psplat[0] = _mm_set_ps1(P.x); Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* traversal loop */ @@ -163,9 +158,9 @@ ccl_device bool BVH_FUNCTION_NAME float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]); + const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); + const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); + const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); @@ -286,15 +281,9 @@ ccl_device bool BVH_FUNCTION_NAME Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif ++stackPtr; @@ -322,15 +311,9 @@ ccl_device bool BVH_FUNCTION_NAME Psplat[1] = _mm_set_ps1(P.y); Psplat[2] = _mm_set_ps1(P.z); - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - shufflex = (idir.x >= 0)? shuf_identity: shuf_swap; - shuffley = (idir.y >= 0)? shuf_identity: shuf_swap; - shufflez = (idir.z >= 0)? shuf_identity: shuf_swap; + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif object = ~0; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index ac4e38ec1b8..2d3a927f227 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -69,6 +69,36 @@ ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf #endif +#ifdef __KERNEL_SSE41__ +ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, + const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3]) +{ + const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) }; + idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn); + idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn); + idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn); + + const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + const __m128 shuf_identity_f = _mm_castsi128_ps(shuf_identity); + const __m128 shuf_swap_f = _mm_castsi128_ps(shuf_swap); + shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask))); + shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask))); + shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask))); +} +#else +ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, + const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3]) +{ + idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); + idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); + idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); + + shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap; + shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap; + shufflexyz[2] = (idir.z >= 0)? shuf_identity: shuf_swap; +} +#endif + template ccl_device_inline const __m128 shuffle(const __m128& a, const __m128& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); -- cgit v1.2.3