Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/intern
diff options
context:
space:
mode:
author: Sv. Lockal <lockalsash@gmail.com>, 2014-02-03 20:46:13 +0400
committer: Sv. Lockal <lockalsash@gmail.com>, 2014-02-03 20:49:07 +0400
commit: cb96cf0b637360a84d86f4f375f28ccc79a53294 (patch)
tree: 7e8d67250f4c74d2328b4d997f3463761c1d8310 /intern
parent: eff3bd4e9880efc0c767e3098baa86ab23d77bc7 (diff)
Cycles: small optimization for SSE 4.1 bvh intersector
Gives a 0.7% - 1.3% speedup for the BMW1M-MikePan scene.
Reviewers: juicyfruit
Differential Revision: https://developer.blender.org/D280
Diffstat (limited to 'intern')
-rw-r--r-- intern/cycles/kernel/kernel_bvh_subsurface.h | 31
-rw-r--r-- intern/cycles/kernel/kernel_bvh_traversal.h | 31
-rw-r--r-- intern/cycles/util/util_simd.h | 30
3 files changed, 44 insertions, 48 deletions
diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
index bb51986b4f4..df82dda2435 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/kernel_bvh_subsurface.h
@@ -66,20 +66,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
__m128 Psplat[3], idirsplat[3];
+ shuffle_swap_t shufflexyz[3];
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
__m128 tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
- shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
/* traversal loop */
@@ -139,9 +134,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+ const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+ const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+ const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
@@ -242,15 +237,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
- shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
++stackPtr;
@@ -285,15 +274,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
- shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
object = ~0;
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index 1ee1fbc3cb4..b4c63f5682c 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -75,20 +75,15 @@ ccl_device bool BVH_FUNCTION_NAME
const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
__m128 Psplat[3], idirsplat[3];
+ shuffle_swap_t shufflexyz[3];
Psplat[0] = _mm_set_ps1(P.x);
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
- shuffle_swap_t shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffle_swap_t shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shuffle_swap_t shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
/* traversal loop */
@@ -163,9 +158,9 @@ ccl_device bool BVH_FUNCTION_NAME
float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflex), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), idirsplat[2]);
+ const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+ const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+ const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
@@ -286,15 +281,9 @@ ccl_device bool BVH_FUNCTION_NAME
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
- shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
++stackPtr;
@@ -322,15 +311,9 @@ ccl_device bool BVH_FUNCTION_NAME
Psplat[1] = _mm_set_ps1(P.y);
Psplat[2] = _mm_set_ps1(P.z);
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
- shufflex = (idir.x >= 0)? shuf_identity: shuf_swap;
- shuffley = (idir.y >= 0)? shuf_identity: shuf_swap;
- shufflez = (idir.z >= 0)? shuf_identity: shuf_swap;
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
object = ~0;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index ac4e38ec1b8..2d3a927f227 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -69,6 +69,36 @@ ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf
#endif
+#ifdef __KERNEL_SSE41__
+ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+ const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+ const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) };
+ idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
+ idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
+ idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
+
+ const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ const __m128 shuf_identity_f = _mm_castsi128_ps(shuf_identity);
+ const __m128 shuf_swap_f = _mm_castsi128_ps(shuf_swap);
+ shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
+ shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
+ shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
+}
+#else
+ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+ const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+ idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+ idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+ idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
+
+ shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap;
+ shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap;
+ shufflexyz[2] = (idir.z >= 0)? shuf_identity: shuf_swap;
+}
+#endif
+
template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a, const __m128& b)
{
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));