diff options
author | Brecht Van Lommel <brecht@blender.org> | 2021-02-14 17:01:26 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-02-17 18:26:24 +0300 |
commit | 0e9497e886924cb75ca67f2c14e2fdda29f2b583 (patch) | |
tree | 45f8afa824786756b3402b2036caf7f92a2cbbf6 /intern | |
parent | 68dd7617d705dd255b29b99074afa107ce38031e (diff) |
Cycles: add support for Arm Neon instructions using sse2neon
Based on patch contributed by Apple and Stefan Werner.
Ref D8237, T78710
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/graph/node_type.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/render/camera.cpp | 3 | ||||
-rw-r--r-- | intern/cycles/render/nodes.cpp | 10 | ||||
-rw-r--r-- | intern/cycles/render/nodes.h | 1 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float3.h | 10 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float4.h | 37 | ||||
-rw-r--r-- | intern/cycles/util/util_optimization.h | 26 | ||||
-rw-r--r-- | intern/cycles/util/util_simd.h | 83 | ||||
-rw-r--r-- | intern/cycles/util/util_sseb.h | 49 | ||||
-rw-r--r-- | intern/cycles/util/util_ssef.h | 116 | ||||
-rw-r--r-- | intern/cycles/util/util_ssei.h | 20 |
11 files changed, 317 insertions, 40 deletions
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp index 2b11af70d71..d1eadf21b1b 100644 --- a/intern/cycles/graph/node_type.cpp +++ b/intern/cycles/graph/node_type.cpp @@ -102,7 +102,7 @@ size_t SocketType::max_size() void *SocketType::zero_default_value() { - static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; + static Transform zero_transform = transform_zero(); return &zero_transform; } diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index 1f932135a57..afe788eb4be 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -741,7 +741,8 @@ float Camera::world_to_raster_size(float3 P) float3 D = transform_point(&worldtocamera, P); float dist = len(D); - Ray ray = {{0}}; + Ray ray; + memset(&ray, 0, sizeof(ray)); /* Distortion can become so great that the results become meaningless, there * may be a better way to do this, but calculating differentials from the diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 84286c9b1a3..b17f1ec0b2f 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -2081,6 +2081,16 @@ ConvertNode::ConvertNode(SocketType::Type from_, SocketType::Type to_, bool auto special_type = SHADER_SPECIAL_TYPE_AUTOCONVERT; } +/* Union usage requires a manual copy constructor. 
*/ +ConvertNode::ConvertNode(const ConvertNode &other) + : ShaderNode(other), + from(other.from), + to(other.to), + value_color(other.value_color), + value_string(other.value_string) +{ +} + void ConvertNode::constant_fold(const ConstantFolder &folder) { /* proxy nodes should have been removed at this point */ diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index d4603143ef4..fb9cf0c9836 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -501,6 +501,7 @@ class RGBToBWNode : public ShaderNode { class ConvertNode : public ShaderNode { public: ConvertNode(SocketType::Type from, SocketType::Type to, bool autoconvert = false); + ConvertNode(const ConvertNode &other); SHADER_NODE_BASE_CLASS(ConvertNode) void constant_fold(const ConstantFolder &folder); diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index 162bc900d9f..67c5c61e4c0 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -304,8 +304,12 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 & ccl_device_inline float3 fabs(const float3 &a) { # ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ + return float3(vabsq_f32(a.m128)); +# else __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return float3(_mm_and_ps(a.m128, mask)); +# endif # else return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); # endif @@ -447,7 +451,13 @@ ccl_device_inline bool is_zero(const float3 a) ccl_device_inline float reduce_add(const float3 a) { +#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__) + __m128 t = a.m128; + t[3] = 0.0f; + return vaddvq_f32(t); +#else return (a.x + a.y + a.z); +#endif } ccl_device_inline float average(const float3 a) diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index 38fdd9e3146..0ba2bafa2f0 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h 
@@ -257,7 +257,12 @@ ccl_device_inline float distance(const float4 &a, const float4 &b) ccl_device_inline float dot(const float4 &a, const float4 &b) { # if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + __m128 t = vmulq_f32(a, b); + return vaddvq_f32(t); +# else return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); +# endif # else return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); # endif @@ -313,8 +318,10 @@ ccl_device_inline bool is_zero(const float4 &a) ccl_device_inline float4 reduce_add(const float4 &a) { -# ifdef __KERNEL_SSE__ -# ifdef __KERNEL_SSE3__ +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return float4(vdupq_n_f32(vaddvq_f32(a))); +# elif defined(__KERNEL_SSE3__) float4 h(_mm_hadd_ps(a.m128, a.m128)); return float4(_mm_hadd_ps(h.m128, h.m128)); # else @@ -373,8 +380,12 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a) { -# ifdef __KERNEL_SSE__ +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return float4(vabsq_f32(a)); +# else return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +# endif # else return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); # endif @@ -400,14 +411,22 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) { +# if defined(__KERNEL_NEON__) + return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128)); +# else return float4(_mm_castsi128_ps( _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); +# endif } template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &a, const float4 &b) { +# if defined(__KERNEL_NEON__) + return float4(shuffle_neon<__m128, index_0, index_1, index_2, 
index_3>(a.m128, b.m128)); +# else return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +# endif } template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b) @@ -457,9 +476,13 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a) ccl_device_inline float4 reduce_min(const float4 &a) { -# ifdef __KERNEL_SSE__ +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return float4(vdupq_n_f32(vminvq_f32(a))); +# else float4 h = min(shuffle<1, 0, 3, 2>(a), a); return min(shuffle<2, 3, 0, 1>(h), h); +# endif # else return make_float4(min(min(a.x, a.y), min(a.z, a.w))); # endif @@ -467,9 +490,13 @@ ccl_device_inline float4 reduce_min(const float4 &a) ccl_device_inline float4 reduce_max(const float4 &a) { -# ifdef __KERNEL_SSE__ +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return float4(vdupq_n_f32(vmaxvq_f32(a))); +# else float4 h = max(shuffle<1, 0, 3, 2>(a), a); return max(shuffle<2, 3, 0, 1>(h), h); +# endif # else return make_float4(max(max(a.x, a.y), max(a.z, a.w))); # endif diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 46dd883282a..7ecd3893cf4 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -27,44 +27,50 @@ /* We require minimum SSE2 support on x86, so auto enable. */ # define __KERNEL_SSE2__ - # ifdef WITH_KERNEL_SSE2 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # endif - # ifdef WITH_KERNEL_SSE3 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 # endif -# endif /* defined(i386) || defined(_M_IX86) */ - /* x86-64 * * Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. 
*/ -# if defined(__x86_64__) || defined(_M_X64) +# elif defined(__x86_64__) || defined(_M_X64) /* SSE2 is always available on x86-64 CPUs, so auto enable */ # define __KERNEL_SSE2__ - /* no SSE2 kernel on x86-64, part of regular kernel */ # ifdef WITH_KERNEL_SSE3 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 # endif - # ifdef WITH_KERNEL_SSE41 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 # endif - # ifdef WITH_KERNEL_AVX # define WITH_CYCLES_OPTIMIZED_KERNEL_AVX # endif - # ifdef WITH_KERNEL_AVX2 # define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # endif -# endif /* defined(__x86_64__) || defined(_M_X64) */ +/* Arm Neon + * + * Compile an SSE4 kernel emulated with Neon. Most code is shared with + * SSE, some specializations for performance and compatibility are made + * by testing for __KERNEL_NEON__. */ + +# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON) + +# define __KERNEL_NEON__ +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSE41__ + +# endif #endif diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 3a6761c6a2f..c51c3c957e0 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -35,6 +35,9 @@ # include <intrin.h> #elif (defined(__x86_64__) || defined(__i386__)) # include <x86intrin.h> +#elif defined(__KERNEL_NEON__) +# define SSE2NEON_PRECISE_MINMAX 1 +# include <sse2neon.h> #endif /* Floating Point Control, for Embree. 
*/ @@ -116,6 +119,80 @@ static struct StepTy { #endif +/* Utilities used by Neon */ +#if defined(__KERNEL_NEON__) +template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a) +{ + if (i0 == i1 && i0 == i2 && i0 == i3) { + return vdupq_laneq_s32(a, i0); + } + static const uint8_t tbl[16] = {(i0 * 4) + 0, + (i0 * 4) + 1, + (i0 * 4) + 2, + (i0 * 4) + 3, + (i1 * 4) + 0, + (i1 * 4) + 1, + (i1 * 4) + 2, + (i1 * 4) + 3, + (i2 * 4) + 0, + (i2 * 4) + 1, + (i2 * 4) + 2, + (i2 * 4) + 3, + (i3 * 4) + 0, + (i3 * 4) + 1, + (i3 * 4) + 2, + (i3 * 4) + 3}; + + return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl); +} + +template<class type, int i0, int i1, int i2, int i3> +type shuffle_neon(const type &a, const type &b) +{ + if (&a == &b) { + static const uint8_t tbl[16] = {(i0 * 4) + 0, + (i0 * 4) + 1, + (i0 * 4) + 2, + (i0 * 4) + 3, + (i1 * 4) + 0, + (i1 * 4) + 1, + (i1 * 4) + 2, + (i1 * 4) + 3, + (i2 * 4) + 0, + (i2 * 4) + 1, + (i2 * 4) + 2, + (i2 * 4) + 3, + (i3 * 4) + 0, + (i3 * 4) + 1, + (i3 * 4) + 2, + (i3 * 4) + 3}; + + return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl); + } + else { + + static const uint8_t tbl[16] = {(i0 * 4) + 0, + (i0 * 4) + 1, + (i0 * 4) + 2, + (i0 * 4) + 3, + (i1 * 4) + 0, + (i1 * 4) + 1, + (i1 * 4) + 2, + (i1 * 4) + 3, + (i2 * 4) + 0 + 16, + (i2 * 4) + 1 + 16, + (i2 * 4) + 2 + 16, + (i2 * 4) + 3 + 16, + (i3 * 4) + 0 + 16, + (i3 * 4) + 1 + 16, + (i3 * 4) + 2 + 16, + (i3 * 4) + 3 + 16}; + + return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl); + } +} +#endif /* __KERNEL_NEON */ + /* Intrinsics Functions * * For fast bit operations. 
*/ @@ -428,8 +505,9 @@ __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input) return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } -# undef _mm_extract_epi32 -# define _mm_extract_epi32 _mm_extract_epi32_emu +# ifndef __KERNEL_NEON__ +# undef _mm_extract_epi32 +# define _mm_extract_epi32 _mm_extract_epi32_emu __forceinline int _mm_extract_epi32_emu(__m128i input, const int index) { switch (index) { @@ -446,6 +524,7 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index) return 0; } } +# endif # undef _mm_insert_epi32 # define _mm_insert_epi32 _mm_insert_epi32_emu diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index edf13e0c493..1488da46b09 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -197,9 +197,14 @@ __forceinline const sseb unpackhi(const sseb &a, const sseb &b) template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle(const sseb &a) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a); +# else return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0))); +# endif } +# ifndef __KERNEL_NEON__ template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a) { return _mm_movelh_ps(a, a); @@ -209,13 +214,19 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a) { return _mm_movehl_ps(a, a); } +# endif template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle(const sseb &a, const sseb &b) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b); +# else return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +# endif } +# ifndef __KERNEL_NEON__ template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b) { return _mm_movelh_ps(a, b); @@ -225,8 +236,9 @@ template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sse { return _mm_movehl_ps(b, a); } +# endif -# if 
defined(__KERNEL_SSE3__) +# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__) template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a) { return _mm_moveldup_ps(a); @@ -241,7 +253,16 @@ template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a) template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert(const sseb &a, const sseb &b) { +# ifdef __KERNEL_NEON__ + sseb res = a; + if (clr) + res[dst] = 0; + else + res[dst] = b[src]; + return res; +# else return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); +# endif } template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b) { @@ -260,7 +281,13 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b # if defined(__KERNEL_SSE41__) __forceinline uint32_t popcnt(const sseb &a) { +# if defined(__KERNEL_NEON__) + const int32x4_t mask = {1, 1, 1, 1}; + int32x4_t t = vandq_s32(a.m128, mask); + return vaddvq_s32(t); +# else return _mm_popcnt_u32(_mm_movemask_ps(a)); +# endif } # else __forceinline uint32_t popcnt(const sseb &a) @@ -271,23 +298,43 @@ __forceinline uint32_t popcnt(const sseb &a) __forceinline bool reduce_and(const sseb &a) { +# if defined(__KERNEL_NEON__) + return vaddvq_s32(a.m128) == -4; +# else return _mm_movemask_ps(a) == 0xf; +# endif } __forceinline bool reduce_or(const sseb &a) { +# if defined(__KERNEL_NEON__) + return vaddvq_s32(a.m128) != 0x0; +# else return _mm_movemask_ps(a) != 0x0; +# endif } __forceinline bool all(const sseb &b) { +# if defined(__KERNEL_NEON__) + return vaddvq_s32(b.m128) == -4; +# else return _mm_movemask_ps(b) == 0xf; +# endif } __forceinline bool any(const sseb &b) { +# if defined(__KERNEL_NEON__) + return vaddvq_s32(b.m128) != 0x0; +# else return _mm_movemask_ps(b) != 0x0; +# endif } __forceinline bool none(const sseb &b) { +# if defined(__KERNEL_NEON__) + return vaddvq_s32(b.m128) == 0x0; +# else return _mm_movemask_ps(b) == 0x0; +# endif } __forceinline 
uint32_t movemask(const sseb &a) diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index b14640ced40..d039b50a7d2 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -303,41 +303,46 @@ __forceinline ssef maxi(const ssef &a, const ssef &b) /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -# if defined(__KERNEL_AVX2__) __forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c) { +# if defined(__KERNEL_NEON__) + return vfmaq_f32(c, a, b); +# elif defined(__KERNEL_AVX2__) return _mm_fmadd_ps(a, b, c); -} -__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c) -{ - return _mm_fmsub_ps(a, b, c); -} -__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c) -{ - return _mm_fnmadd_ps(a, b, c); -} -__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c) -{ - return _mm_fnmsub_ps(a, b, c); -} # else -__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c) -{ return a * b + c; +# endif } __forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c) { +# if defined(__KERNEL_NEON__) + return vfmaq_f32(vnegq_f32(c), a, b); +# elif defined(__KERNEL_AVX2__) + return _mm_fmsub_ps(a, b, c); +# else return a * b - c; +# endif } __forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c) { +# if defined(__KERNEL_NEON__) + return vfmsq_f32(c, a, b); +# elif defined(__KERNEL_AVX2__) + return _mm_fnmadd_ps(a, b, c); +# else return c - a * b; +# endif } __forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c) { +# if defined(__KERNEL_NEON__) + return vfmsq_f32(vnegq_f32(c), a, b); +# elif defined(__KERNEL_AVX2__) + return _mm_fnmsub_ps(a, b, c); +# else return -a * b - c; -} # endif +} //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -496,27 +501,51 @@ __forceinline const ssef 
select(const int mask, const ssef &t, const ssef &f) # if defined(__KERNEL_SSE41__) __forceinline const ssef round_even(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndnq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); +# endif } __forceinline const ssef round_down(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndmq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF); +# endif } __forceinline const ssef round_up(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndpq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_POS_INF); +# endif } __forceinline const ssef round_zero(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_ZERO); +# endif } __forceinline const ssef floor(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndnq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF); +# endif } __forceinline const ssef ceil(const ssef &a) { +# ifdef __KERNEL_NEON__ + return vrndpq_f32(a); +# else return _mm_round_ps(a, _MM_FROUND_TO_POS_INF); +# endif } # endif @@ -566,7 +595,11 @@ __forceinline ssef unpackhi(const ssef &a, const ssef &b) template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef &b) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128); +# else return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); +# endif } template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a) @@ -582,14 +615,23 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a) template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef &a, const ssef &b) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b); +# else return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +# endif } template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b) { +# ifdef 
__KERNEL_NEON__ + return shuffle<float32x4_t, i0, i0, i0, i0>(a, b); +# else return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)); +# endif } +# ifndef __KERNEL_NEON__ template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b) { return _mm_movelh_ps(a, b); @@ -599,6 +641,7 @@ template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const sse { return _mm_movehl_ps(b, a); } +# endif # if defined(__KERNEL_SSSE3__) __forceinline const ssef shuffle8(const ssef &a, const ssei &shuf) @@ -643,7 +686,16 @@ template<> __forceinline float extract<0>(const ssef &a) template<size_t dst, size_t src, size_t clr> __forceinline const ssef insert(const ssef &a, const ssef &b) { +# ifdef __KERNEL_NEON__ + ssef res = a; + if (clr) + res[dst] = 0; + else + res[dst] = b[src]; + return res; +# else return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); +# endif } template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b) { @@ -703,31 +755,55 @@ __forceinline void transpose( __forceinline const ssef vreduce_min(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vdupq_n_f32(vminvq_f32(v)); +# else ssef h = min(shuffle<1, 0, 3, 2>(v), v); return min(shuffle<2, 3, 0, 1>(h), h); +# endif } __forceinline const ssef vreduce_max(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vdupq_n_f32(vmaxvq_f32(v)); +# else ssef h = max(shuffle<1, 0, 3, 2>(v), v); return max(shuffle<2, 3, 0, 1>(h), h); +# endif } __forceinline const ssef vreduce_add(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vdupq_n_f32(vaddvq_f32(v)); +# else ssef h = shuffle<1, 0, 3, 2>(v) + v; return shuffle<2, 3, 0, 1>(h) + h; +# endif } __forceinline float reduce_min(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vminvq_f32(v); +# else return _mm_cvtss_f32(vreduce_min(v)); +# endif } __forceinline float reduce_max(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vmaxvq_f32(v); +# else return _mm_cvtss_f32(vreduce_max(v)); +# endif } 
__forceinline float reduce_add(const ssef &v) { +# ifdef __KERNEL_NEON__ + return vaddvq_f32(v); +# else return _mm_cvtss_f32(vreduce_add(v)); +# endif } __forceinline uint32_t select_min(const ssef &v) @@ -942,14 +1018,14 @@ ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf) { /* shuffle value must be a constant, so we need to branch */ if (shuf) - return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2))); + return shuffle<1, 0, 3, 2>(a); else - return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0))); + return shuffle<3, 2, 1, 0>(a); } # endif -# ifdef __KERNEL_SSE41__ +# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__) ccl_device_inline void gen_idirsplat_swap(const ssef &pn, const shuffle_swap_t &shuf_identity, diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index c03ab18a6df..3ec69ab3700 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -445,14 +445,22 @@ __forceinline ssei unpackhi(const ssei &a, const ssei &b) template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle(const ssei &a) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<ssei, i0, i1, i2, i3>(a); +# else return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); +# endif } template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle(const ssei &a, const ssei &b) { +# ifdef __KERNEL_NEON__ + return shuffle_neon<ssei, i0, i1, i2, i3>(a, b); +# else return _mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); +# endif } template<size_t i0> __forceinline const ssei shuffle(const ssei &b) @@ -505,15 +513,27 @@ __forceinline const ssei vreduce_add(const ssei &v) __forceinline int reduce_min(const ssei &v) { +# ifdef __KERNEL_NEON__ + return vminvq_s32(v); +# else return extract<0>(vreduce_min(v)); +# endif } __forceinline int reduce_max(const ssei &v) { +# ifdef __KERNEL_NEON__ + 
return vmaxvq_s32(v); +# else return extract<0>(vreduce_max(v)); +# endif } __forceinline int reduce_add(const ssei &v) { +# ifdef __KERNEL_NEON__ + return vaddvq_s32(v); +# else return extract<0>(vreduce_add(v)); +# endif } __forceinline uint32_t select_min(const ssei &v) |