
git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2021-02-14 17:01:26 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-02-17 18:26:24 +0300
commit    0e9497e886924cb75ca67f2c14e2fdda29f2b583 (patch)
tree      45f8afa824786756b3402b2036caf7f92a2cbbf6 /intern
parent    68dd7617d705dd255b29b99074afa107ce38031e (diff)
Cycles: add support for Arm Neon instructions using sse2neon
Based on a patch contributed by Apple and Stefan Werner. Ref D8237, T78710
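
For context, sse2neon is a header-only translation layer that implements the x86 SSE/SSE2/SSE4.1 intrinsics on top of Arm Neon, which is why most of the existing SSE code paths in Cycles can compile unchanged on Arm. A minimal standalone sketch of the idea (illustrative, not part of this patch):

#if defined(__ARM_NEON)
#  define SSE2NEON_PRECISE_MINMAX 1
#  include <sse2neon.h> /* maps the _mm_* intrinsics to Neon equivalents */
#else
#  include <emmintrin.h>
#endif

/* The same source compiles on x86 and Arm; on Arm the calls become Neon ops. */
static inline __m128 scale_and_add(__m128 a, __m128 b, float s)
{
  return _mm_add_ps(a, _mm_mul_ps(b, _mm_set1_ps(s)));
}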
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/graph/node_type.cpp       |   2
-rw-r--r--  intern/cycles/render/camera.cpp         |   3
-rw-r--r--  intern/cycles/render/nodes.cpp          |  10
-rw-r--r--  intern/cycles/render/nodes.h            |   1
-rw-r--r--  intern/cycles/util/util_math_float3.h   |  10
-rw-r--r--  intern/cycles/util/util_math_float4.h   |  37
-rw-r--r--  intern/cycles/util/util_optimization.h  |  26
-rw-r--r--  intern/cycles/util/util_simd.h          |  83
-rw-r--r--  intern/cycles/util/util_sseb.h          |  49
-rw-r--r--  intern/cycles/util/util_ssef.h          | 116
-rw-r--r--  intern/cycles/util/util_ssei.h          |  20
11 files changed, 317 insertions(+), 40 deletions(-)
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index 2b11af70d71..d1eadf21b1b 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -102,7 +102,7 @@ size_t SocketType::max_size()
void *SocketType::zero_default_value()
{
- static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+ static Transform zero_transform = transform_zero();
return &zero_transform;
}
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 1f932135a57..afe788eb4be 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P)
float3 D = transform_point(&worldtocamera, P);
float dist = len(D);
- Ray ray = {{0}};
+ Ray ray;
+ memset(&ray, 0, sizeof(ray));
/* Distortion can become so great that the results become meaningless, there
* may be a better way to do this, but calculating differentials from the
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 84286c9b1a3..b17f1ec0b2f 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto
special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT;
}
+/* Union usage requires a manual copy constructor. */
+ConvertNode::ConvertNode(const ConvertNode &other)
+ : ShaderNode(other),
+ from(other.from),
+ to(other.to),
+ value_color(other.value_color),
+ value_string(other.value_string)
+{
+}
+
void ConvertNode::constant_fold(const ConstantFolder &folder)
{
/* proxy nodes should have been removed at this point */
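
The comment on the new ConvertNode copy constructor refers to a general C++ rule: when a class stores one of several alternative values in an anonymous union, the compiler-generated copy cannot be relied on (and is deleted outright as soon as a union member has a non-trivial copy constructor), so the copy constructor spells out which member to copy. A hypothetical sketch of the pattern, not the actual ConvertNode layout:

struct ValueNode {
  enum Type { TYPE_FLOAT, TYPE_INT } type;
  union {
    float value_float;
    int value_int;
  };

  explicit ValueNode(Type t) : type(t), value_int(0) {}

  /* Manual copy constructor: copy whichever union member is active. */
  ValueNode(const ValueNode &other) : type(other.type)
  {
    if (type == TYPE_FLOAT)
      value_float = other.value_float;
    else
      value_int = other.value_int;
  }
};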
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index d4603143ef4..fb9cf0c9836 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode {
class ConvertNode : public ShaderNode {
public:
ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false);
+ ConvertNode(const ConvertNode &other);
SHADER_NODE_BASE_CLASS(ConvertNode)
void constant_fold(const ConstantFolder &folder);
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 162bc900d9f..67c5c61e4c0 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float3(vabsq_f32(a.m128));
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
+# endif
# else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
@@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a)
ccl_device_inline float reduce_add(const float3 a)
{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
+ __m128 t = a.m128;
+ t[3] = 0.0f;
+ return vaddvq_f32(t);
+#else
return (a.x + a.y + a.z);
+#endif
}
ccl_device_inline float average(const float3 a)
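
The Neon branch of reduce_add() above clears the fourth lane before the horizontal add because a float3 is held in a full 4-lane register whose last lane is padding. A standalone sketch of the same idiom, assuming an AArch64 compiler with <arm_neon.h>:

#include <arm_neon.h>

/* Sum only the x, y, z lanes of a 4-lane register. */
static inline float sum_xyz(float32x4_t v)
{
  v = vsetq_lane_f32(0.0f, v, 3); /* zero the padding lane */
  return vaddvq_f32(v);           /* horizontal add across all four lanes */
}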
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 38fdd9e3146..0ba2bafa2f0 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
# endif
@@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a)
ccl_device_inline float4 reduce_add(const float4 &a)
{
-# ifdef __KERNEL_SSE__
-# ifdef __KERNEL_SSE3__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vaddvq_f32(a)));
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
# else
@@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+# endif
# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
@@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
{
+# if defined(__KERNEL_NEON__)
+ return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+# else
return float4(_mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+# endif
}
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &a, const float4 &b)
{
+# if defined(__KERNEL_NEON__)
+ return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# else
return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+# endif
}
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
@@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 reduce_min(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vminvq_f32(a)));
+# else
float4 h = min(shuffle<1, 0, 3, 2>(a), a);
return min(shuffle<2, 3, 0, 1>(h), h);
+# endif
# else
return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
# endif
@@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a)
ccl_device_inline float4 reduce_max(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vdupq_n_f32(vmaxvq_f32(a)));
+# else
float4 h = max(shuffle<1, 0, 3, 2>(a), a);
return max(shuffle<2, 3, 0, 1>(h), h);
+# endif
# else
return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
# endif
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 46dd883282a..7ecd3893cf4 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -27,44 +27,50 @@
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
-
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif
-
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
-# endif /* defined(i386) || defined(_M_IX86) */
-
/* x86-64
*
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
-# if defined(__x86_64__) || defined(_M_X64)
+# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
-
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE3
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# endif
-
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# endif
-
# ifdef WITH_KERNEL_AVX
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# endif
-
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif
-# endif /* defined(__x86_64__) || defined(_M_X64) */
+/* Arm Neon
+ *
+ * Compile an SSE4 kernel emulated with Neon. Most code is shared with
+ * SSE; some specializations for performance and compatibility are made
+ * by testing for __KERNEL_NEON__. */
+
+# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
+
+# define __KERNEL_NEON__
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSE41__
+
+# endif
#endif
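
With these definitions, an Arm build configured with WITH_SSE2NEON follows the same code paths as an SSE 4.1 x86 build, and individual functions only branch on __KERNEL_NEON__ where a dedicated Neon sequence is faster or more compatible. A sketch of how kernel code typically tests the macros (illustrative function, not taken from the patch):

ccl_device_inline float4 sqr4(const float4 &a)
{
#if defined(__KERNEL_NEON__)
  return float4(vmulq_f32(a, a)); /* Neon-specific variant */
#elif defined(__KERNEL_SSE__)
  return float4(_mm_mul_ps(a.m128, a.m128)); /* shared SSE path; also builds via sse2neon */
#else
  return make_float4(a.x * a.x, a.y * a.y, a.z * a.z, a.w * a.w); /* scalar fallback */
#endif
}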
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 3a6761c6a2f..c51c3c957e0 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -35,6 +35,9 @@
# include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
+#elif defined(__KERNEL_NEON__)
+# define SSE2NEON_PRECISE_MINMAX 1
+# include <sse2neon.h>
#endif
/* Floating Point Control, for Embree. */
@@ -116,6 +119,80 @@ static struct StepTy {
#endif
+/* Utilities used by Neon */
+#if defined(__KERNEL_NEON__)
+template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
+{
+ if (i0 == i1 && i0 == i2 && i0 == i3) {
+ return vdupq_laneq_s32(a, i0);
+ }
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0,
+ (i2 * 4) + 1,
+ (i2 * 4) + 2,
+ (i2 * 4) + 3,
+ (i3 * 4) + 0,
+ (i3 * 4) + 1,
+ (i3 * 4) + 2,
+ (i3 * 4) + 3};
+
+ return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
+}
+
+template<class type, int i0, int i1, int i2, int i3>
+type shuffle_neon(const type &a, const type &b)
+{
+ if (&a == &b) {
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0,
+ (i2 * 4) + 1,
+ (i2 * 4) + 2,
+ (i2 * 4) + 3,
+ (i3 * 4) + 0,
+ (i3 * 4) + 1,
+ (i3 * 4) + 2,
+ (i3 * 4) + 3};
+
+ return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
+ }
+ else {
+
+ static const uint8_t tbl[16] = {(i0 * 4) + 0,
+ (i0 * 4) + 1,
+ (i0 * 4) + 2,
+ (i0 * 4) + 3,
+ (i1 * 4) + 0,
+ (i1 * 4) + 1,
+ (i1 * 4) + 2,
+ (i1 * 4) + 3,
+ (i2 * 4) + 0 + 16,
+ (i2 * 4) + 1 + 16,
+ (i2 * 4) + 2 + 16,
+ (i2 * 4) + 3 + 16,
+ (i3 * 4) + 0 + 16,
+ (i3 * 4) + 1 + 16,
+ (i3 * 4) + 2 + 16,
+ (i3 * 4) + 3 + 16};
+
+ return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl);
+ }
+}
+#endif /* __KERNEL_NEON__ */
+
/* Intrinsics Functions
*
* For fast bit operations. */
@@ -428,8 +505,9 @@ __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
-# undef _mm_extract_epi32
-# define _mm_extract_epi32 _mm_extract_epi32_emu
+# ifndef __KERNEL_NEON__
+# undef _mm_extract_epi32
+# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
@@ -446,6 +524,7 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
return 0;
}
}
+# endif
# undef _mm_insert_epi32
# define _mm_insert_epi32 _mm_insert_epi32_emu
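
The shuffle_neon<> helpers above take the four destination lane indices as template parameters and build a byte-permutation table for vqtbl1q_s8/vqtbl2q_s8; the one-argument form also short-circuits to a lane broadcast when all four indices are equal, and in the two-argument form the last two indices select lanes from the second operand. A hedged usage sketch, assuming these helpers are in scope:

/* Reverse the lanes of one vector. */
float32x4_t reverse_lanes(const float32x4_t &v)
{
  return shuffle_neon<float32x4_t, 3, 2, 1, 0>(v);
}

/* Take lanes 0,1 from a and lanes 2,3 from b. */
float32x4_t low_a_high_b(const float32x4_t &a, const float32x4_t &b)
{
  return shuffle_neon<float32x4_t, 0, 1, 2, 3>(a, b);
}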
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index edf13e0c493..1488da46b09 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -197,9 +197,14 @@ __forceinline const sseb unpackhi(const sseb &a, const sseb &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
+# else
return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
}
+# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
{
return _mm_movelh_ps(a, a);
@@ -209,13 +214,19 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
{
return _mm_movehl_ps(a, a);
}
+# endif
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const sseb shuffle(const sseb &a, const sseb &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
+# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+# endif
}
+# ifndef __KERNEL_NEON__
template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
{
return _mm_movelh_ps(a, b);
@@ -225,8 +236,9 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sse
{
return _mm_movehl_ps(b, a);
}
+# endif
-# if defined(__KERNEL_SSE3__)
+# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
{
return _mm_moveldup_ps(a);
@@ -241,7 +253,16 @@ template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const sseb insert(const sseb &a, const sseb &b)
{
+# ifdef __KERNEL_NEON__
+ sseb res = a;
+ if (clr)
+ res[dst] = 0;
+ else
+ res[dst] = b[src];
+ return res;
+# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
+# endif
}
template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
{
@@ -260,7 +281,13 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
# if defined(__KERNEL_SSE41__)
__forceinline uint32_t popcnt(const sseb &a)
{
+# if defined(__KERNEL_NEON__)
+ const int32x4_t mask = {1, 1, 1, 1};
+ int32x4_t t = vandq_s32(a.m128, mask);
+ return vaddvq_s32(t);
+# else
return _mm_popcnt_u32(_mm_movemask_ps(a));
+# endif
}
# else
__forceinline uint32_t popcnt(const sseb &a)
@@ -271,23 +298,43 @@ __forceinline uint32_t popcnt(const sseb &a)
__forceinline bool reduce_and(const sseb &a)
{
+# if defined(__KERNEL_NEON__)
+ return vaddvq_s32(a.m128) == -4;
+# else
return _mm_movemask_ps(a) == 0xf;
+# endif
}
__forceinline bool reduce_or(const sseb &a)
{
+# if defined(__KERNEL_NEON__)
+ return vaddvq_s32(a.m128) != 0x0;
+# else
return _mm_movemask_ps(a) != 0x0;
+# endif
}
__forceinline bool all(const sseb &b)
{
+# if defined(__KERNEL_NEON__)
+ return vaddvq_s32(b.m128) == -4;
+# else
return _mm_movemask_ps(b) == 0xf;
+# endif
}
__forceinline bool any(const sseb &b)
{
+# if defined(__KERNEL_NEON__)
+ return vaddvq_s32(b.m128) != 0x0;
+# else
return _mm_movemask_ps(b) != 0x0;
+# endif
}
__forceinline bool none(const sseb &b)
{
+# if defined(__KERNEL_NEON__)
+ return vaddvq_s32(b.m128) == 0x0;
+# else
return _mm_movemask_ps(b) == 0x0;
+# endif
}
__forceinline uint32_t movemask(const sseb &a)
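
The -4 and 0 comparisons in the Neon branches above follow from how an sseb mask is represented: a true lane is all ones, which reads as -1 in a signed 32-bit lane, so four true lanes sum to -4 with vaddvq_s32 and an all-false mask sums to 0. A standalone sketch of the same test:

#include <arm_neon.h>

/* Comparison intrinsics produce 0xFFFFFFFF per true lane (-1 as signed). */
static inline bool all_lanes_true(uint32x4_t mask)
{
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
}

static inline bool any_lane_true(uint32x4_t mask)
{
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) != 0;
}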
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index b14640ced40..d039b50a7d2 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b)
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
-# if defined(__KERNEL_AVX2__)
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmaq_f32(c, a, b);
+# elif defined(__KERNEL_AVX2__)
return _mm_fmadd_ps(a, b, c);
-}
-__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fmsub_ps(a, b, c);
-}
-__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fnmadd_ps(a, b, c);
-}
-__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
-{
- return _mm_fnmsub_ps(a, b, c);
-}
# else
-__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
-{
return a * b + c;
+# endif
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmaq_f32(vnegq_f32(c), a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fmsub_ps(a, b, c);
+# else
return a * b - c;
+# endif
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmsq_f32(c, a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fnmadd_ps(a, b, c);
+# else
return c - a * b;
+# endif
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
+# if defined(__KERNEL_NEON__)
+ return vfmsq_f32(vnegq_f32(c), a, b);
+# elif defined(__KERNEL_AVX2__)
+ return _mm_fnmsub_ps(a, b, c);
+# else
return -a * b - c;
-}
# endif
+}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
@@ -496,27 +501,51 @@ __forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
# if defined(__KERNEL_SSE41__)
__forceinline const ssef round_even(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndnq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
+# endif
}
__forceinline const ssef round_down(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndmq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
+# endif
}
__forceinline const ssef round_up(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndpq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
+# endif
}
__forceinline const ssef round_zero(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
+# endif
}
__forceinline const ssef floor(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndnq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
+# endif
}
__forceinline const ssef ceil(const ssef &a)
{
+# ifdef __KERNEL_NEON__
+ return vrndpq_f32(a);
+# else
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
+# endif
}
# endif
@@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
+# else
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
}
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
@@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
+# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+# endif
}
template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
+# else
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
+# endif
}
+# ifndef __KERNEL_NEON__
template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
{
return _mm_movelh_ps(a, b);
@@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse
{
return _mm_movehl_ps(b, a);
}
+# endif
# if defined(__KERNEL_SSSE3__)
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
@@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a)
template<size_t dst, size_t src, size_t clr>
__forceinline const ssef insert(const ssef &a, const ssef &b)
{
+# ifdef __KERNEL_NEON__
+ ssef res = a;
+ if (clr)
+ res[dst] = 0;
+ else
+ res[dst] = b[src];
+ return res;
+# else
return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
+# endif
}
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
{
@@ -703,31 +755,55 @@ __forceinline void transpose(
__forceinline const ssef vreduce_min(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vminvq_f32(v));
+# else
ssef h = min(shuffle<1, 0, 3, 2>(v), v);
return min(shuffle<2, 3, 0, 1>(h), h);
+# endif
}
__forceinline const ssef vreduce_max(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vmaxvq_f32(v));
+# else
ssef h = max(shuffle<1, 0, 3, 2>(v), v);
return max(shuffle<2, 3, 0, 1>(h), h);
+# endif
}
__forceinline const ssef vreduce_add(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vdupq_n_f32(vaddvq_f32(v));
+# else
ssef h = shuffle<1, 0, 3, 2>(v) + v;
return shuffle<2, 3, 0, 1>(h) + h;
+# endif
}
__forceinline float reduce_min(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vminvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_min(v));
+# endif
}
__forceinline float reduce_max(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vmaxvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_max(v));
+# endif
}
__forceinline float reduce_add(const ssef &v)
{
+# ifdef __KERNEL_NEON__
+ return vaddvq_f32(v);
+# else
return _mm_cvtss_f32(vreduce_add(v));
+# endif
}
__forceinline uint32_t select_min(const ssef &v)
@@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
{
/* shuffle value must be a constant, so we need to branch */
if (shuf)
- return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
+ return shuffle<1, 0, 3, 2>(a);
else
- return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
+ return shuffle<3, 2, 1, 0>(a);
}
# endif
-# ifdef __KERNEL_SSE41__
+# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
const shuffle_swap_t &shuf_identity,
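
The fused multiply-add rewrites in util_ssef.h lean on two AArch64 intrinsics: vfmaq_f32(c, a, b) computes c + a*b and vfmsq_f32(c, a, b) computes c - a*b, so negating c first yields the remaining two variants. A brief standalone sketch of the correspondence:

#include <arm_neon.h>

static inline float32x4_t madd4(float32x4_t a, float32x4_t b, float32x4_t c)
{
  return vfmaq_f32(c, a, b); /*  a*b + c */
}
static inline float32x4_t msub4(float32x4_t a, float32x4_t b, float32x4_t c)
{
  return vfmaq_f32(vnegq_f32(c), a, b); /*  a*b - c */
}
static inline float32x4_t nmadd4(float32x4_t a, float32x4_t b, float32x4_t c)
{
  return vfmsq_f32(c, a, b); /*  c - a*b */
}
static inline float32x4_t nmsub4(float32x4_t a, float32x4_t b, float32x4_t c)
{
  return vfmsq_f32(vnegq_f32(c), a, b); /* -a*b - c */
}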
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index c03ab18a6df..3ec69ab3700 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -445,14 +445,22 @@ __forceinline ssei unpackhi(const ssei &a, const ssei &b)
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<ssei, i0, i1, i2, i3>(a);
+# else
return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
+# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
{
+# ifdef __KERNEL_NEON__
+ return shuffle_neon<ssei, i0, i1, i2, i3>(a, b);
+# else
return _mm_castps_si128(
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
}
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
@@ -505,15 +513,27 @@ __forceinline const ssei vreduce_add(const ssei &v)
__forceinline int reduce_min(const ssei &v)
{
+# ifdef __KERNEL_NEON__
+ return vminvq_s32(v);
+# else
return extract<0>(vreduce_min(v));
+# endif
}
__forceinline int reduce_max(const ssei &v)
{
+# ifdef __KERNEL_NEON__
+ return vmaxvq_s32(v);
+# else
return extract<0>(vreduce_max(v));
+# endif
}
__forceinline int reduce_add(const ssei &v)
{
+# ifdef __KERNEL_NEON__
+ return vaddvq_s32(v);
+# else
return extract<0>(vreduce_add(v));
+# endif
}
__forceinline uint32_t select_min(const ssei &v)