git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2021-02-14 17:01:26 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-02-17 18:26:24 +0300
commit    0e9497e886924cb75ca67f2c14e2fdda29f2b583 (patch)
tree      45f8afa824786756b3402b2036caf7f92a2cbbf6 /intern/cycles/util/util_ssef.h
parent    68dd7617d705dd255b29b99074afa107ce38031e (diff)
Cycles: add support for Arm Neon instructions using sse2neon
Based on a patch contributed by Apple and Stefan Werner. Ref D8237, T78710
Diffstat (limited to 'intern/cycles/util/util_ssef.h')
-rw-r--r--  intern/cycles/util/util_ssef.h | 116
1 file changed, 96 insertions(+), 20 deletions(-)
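A note on the approach before the diff: sse2neon is a drop-in header that implements the x86 _mm_* intrinsics on top of NEON, so the bulk of util_ssef.h already compiles unchanged on Arm. The hunks below additionally add native __KERNEL_NEON__ paths where a single NEON instruction beats the emulated SSE sequence. The helper below is a hedged illustration of the sse2neon idea, not part of the patch; in Cycles the include selection happens in its own headers.

/* Hedged illustration, not from the patch: with sse2neon, the same SSE
 * source builds on both x86 and Arm; only the include line differs. */
#if defined(__aarch64__)
#  include "sse2neon.h" /* provides _mm_* implemented with NEON */
#else
#  include <xmmintrin.h> /* genuine SSE */
#endif

/* Dot product of two 4-float vectors written purely with SSE intrinsics.
 * The two-shuffle reduction ladder is the same technique vreduce_add in
 * this file uses on the SSE path. */
static inline float dot4(const float *a, const float *b)
{
  __m128 m = _mm_mul_ps(_mm_loadu_ps(a), _mm_loadu_ps(b));
  /* pairwise sums: (m0+m2, m1+m3, ...) */
  __m128 h = _mm_add_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 0, 3, 2)));
  /* combine the two pairwise sums into every lane, take lane 0 */
  h = _mm_add_ps(h, _mm_shuffle_ps(h, h, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtss_f32(h);
}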
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index b14640ced40..d039b50a7d2 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b)
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
-# if defined(__KERNEL_AVX2__)
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmaq_f32(c, a, b);
+# elif defined(__KERNEL_AVX2__)
return _mm_fmadd_ps(a, b, c);
-}
-__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fmsub_ps(a, b, c);
-}
-__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fnmadd_ps(a, b, c);
-}
-__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fnmsub_ps(a, b, c);
-}
# else
-__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
-{
return a * b + c;
+# endif
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmaq_f32(vnegq_f32(c), a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fmsub_ps(a, b, c);
+# else
return a * b - c;
+# endif
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmsq_f32(c, a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fnmadd_ps(a, b, c);
+# else
return c - a * b;
+# endif
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmsq_f32(vnegq_f32(c), a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fnmsub_ps(a, b, c);
+# else
return -a * b - c;
-}
# endif
+}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
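Editor's note on the hunk above: the NEON fused-multiply intrinsics take the addend first, so vfmaq_f32(c, a, b) computes c + a * b, the mirror image of _mm_fmadd_ps(a, b, c) = a * b + c. NEON also has no fused "multiply, then subtract the accumulator", which is why msub and nmsub negate c before fusing. A hedged scalar reference (not from the patch):

/* Scalar semantics of the four fused ops, for checking operand order.
 * Trailing comments give the NEON mapping used in the hunk above. */
inline float madd_ref(float a, float b, float c) { return a * b + c; }   /* vfmaq_f32(c, a, b) */
inline float msub_ref(float a, float b, float c) { return a * b - c; }   /* vfmaq_f32(vnegq_f32(c), a, b) */
inline float nmadd_ref(float a, float b, float c) { return c - a * b; }  /* vfmsq_f32(c, a, b) */
inline float nmsub_ref(float a, float b, float c) { return -a * b - c; } /* vfmsq_f32(vnegq_f32(c), a, b) */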
@@ -496,27 +501,51 @@ __forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
# if defined(__KERNEL_SSE41__)
__forceinline const ssef round_even(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndnq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
+# endif
}
__forceinline const ssef round_down(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndmq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
+# endif
}
__forceinline const ssef round_up(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndpq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
+# endif
}
__forceinline const ssef round_zero(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
+# endif
}
__forceinline const ssef floor(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndmq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
+# endif
}
__forceinline const ssef ceil(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndpq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
+# endif
}
# endif
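Editor's note on the rounding hunk: each SSE4.1 _MM_FROUND_* mode has a one-to-one A64 counterpart, and floor/ceil are round_down/round_up under other names, so floor must use vrndmq_f32 (round toward minus infinity) to match _MM_FROUND_TO_NEG_INF. A hedged summary with scalar equivalents (not from the patch):

/* NEON intrinsic   SSE4.1 mode                 scalar equivalent
 * vrndnq_f32       _MM_FROUND_TO_NEAREST_INT   nearbyintf (ties to even)
 * vrndmq_f32       _MM_FROUND_TO_NEG_INF       floorf
 * vrndpq_f32       _MM_FROUND_TO_POS_INF       ceilf
 * vrndq_f32        _MM_FROUND_TO_ZERO          truncf */
#include <cassert>
#include <cmath>

int main()
{
  assert(std::floor(-1.5f) == -2.0f);   /* round_down / floor */
  assert(std::ceil(-1.5f) == -1.0f);    /* round_up / ceil */
  assert(std::trunc(-1.5f) == -1.0f);   /* round_zero */
  assert(std::nearbyint(2.5f) == 2.0f); /* round_even: tie goes to even */
  return 0;
}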
@@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
+# else
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
}
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
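Editor's note on the shuffle templates: i_k names the source lane for result element k, while _MM_SHUFFLE packs its arguments high lane first, which is why the SSE call site reverses the order to _MM_SHUFFLE(i3, i2, i1, i0). A hedged scalar model (not from the patch):

#include <cstddef>

/* Element k of the result is lane i_k of the input; e.g.
 * shuffle_ref<1, 0, 3, 2> swaps adjacent lane pairs. */
template<size_t i0, size_t i1, size_t i2, size_t i3>
inline void shuffle_ref(float r[4], const float b[4])
{
  r[0] = b[i0];
  r[1] = b[i1];
  r[2] = b[i2];
  r[3] = b[i3];
}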
@@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
+# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+# endif
}
template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
+# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
+# endif
}
+# ifndef __KERNEL_NEON__
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
{
return _mm_movelh_ps(a, b);
@@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse
{
return _mm_movehl_ps(b, a);
}
+# endif
# if defined(__KERNEL_SSSE3__)
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
@@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const ssef insert(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ ssef res = a;
+ if (clr)
+ res[dst] = 0;
+ else
+ res[dst] = b[src];
+ return res;
+# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
+# endif
}
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
{
@@ -703,31 +755,55 @@ __forceinline void transpose(
__forceinline const ssef vreduce_min(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vminvq_f32(v));
+# else
ssef h = min(shuffle<1, 0, 3, 2>(v), v);
return min(shuffle<2, 3, 0, 1>(h), h);
+# endif
}
__forceinline const ssef vreduce_max(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vmaxvq_f32(v));
+# else
ssef h = max(shuffle<1, 0, 3, 2>(v), v);
return max(shuffle<2, 3, 0, 1>(h), h);
+# endif
}
__forceinline const ssef vreduce_add(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vaddvq_f32(v));
+# else
ssef h = shuffle<1, 0, 3, 2>(v) + v;
return shuffle<2, 3, 0, 1>(h) + h;
+# endif
}
__forceinline float reduce_min(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vminvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_min(v));
+# endif
}
__forceinline float reduce_max(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vmaxvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_max(v));
+# endif
}
__forceinline float reduce_add(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vaddvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_add(v));
+# endif
}
__forceinline uint32_t select_min(const ssef &v)
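Editor's note on the reductions: A64 NEON has true across-vector instructions (vminvq_f32, vmaxvq_f32, vaddvq_f32), so each scalar reduce_* becomes a single intrinsic, and the splatted vreduce_* variants just broadcast that scalar with vdupq_n_f32 instead of running the two-shuffle ladder the SSE path needs. A hedged scalar model of the two flavours (not from the patch):

/* reduce_add returns the scalar sum; vreduce_add broadcasts it to
 * every lane of the result. */
static inline float reduce_add_ref(const float v[4])
{
  return v[0] + v[1] + v[2] + v[3];
}

static inline void vreduce_add_ref(float out[4], const float v[4])
{
  const float s = reduce_add_ref(v);
  for (int i = 0; i < 4; i++)
    out[i] = s; /* what vdupq_n_f32(vaddvq_f32(v)) produces */
}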
@@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
{
/* shuffle value must be a constant, so we need to branch */
if (shuf)
- return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
+ return shuffle<1, 0, 3, 2>(a);
else
- return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
+ return shuffle<3, 2, 1, 0>(a);
}
# endif
-# ifdef __KERNEL_SSE41__
+# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
const shuffle_swap_t &shuf_identity,