diff options
Diffstat (limited to 'source/blender/blenlib/intern')
-rw-r--r-- | source/blender/blenlib/intern/math_base_inline.c | 20 |
1 file changed, 17 insertions, 3 deletions
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index a983821f15e..4a213f5fe74 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -767,6 +767,20 @@ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
   return _mm_mul_ps(x, _mm_mul_ps(x, x));
 }
 
+MALWAYS_INLINE __m128 _bli_math_rsqrt(__m128 in)
+{
+  __m128 r = _mm_rsqrt_ps(in);
+  /* Only do additional Newton-Raphson iterations when using actual SSE
+   * code path. When we are emulating SSE on NEON via sse2neon, the
+   * additional NR iterations are already done inside _mm_rsqrt_ps
+   * emulation. */
+#  if defined(__SSE2__)
+  r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
+                 _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(in, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#  endif
+  return r;
+}
+
 /* Calculate powf(x, 1.0f / 2.4) */
 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
 {
@@ -776,14 +790,14 @@ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
    */
   __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
   __m128 xover = _mm_mul_ps(arg, xf);
-  __m128 xfm1 = _mm_rsqrt_ps(xf);
+  __m128 xfm1 = _bli_math_rsqrt(xf);
   __m128 x2 = _mm_mul_ps(arg, arg);
   __m128 xunder = _mm_mul_ps(x2, xfm1);
   /* sqrt2 * over + 2 * sqrt2 * under */
   __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
                            _mm_add_ps(xover, xunder));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
   return xavg;
 }
 