git.blender.org/blender.git
Diffstat (limited to 'source/blender/blenlib/intern/math_base_inline.c')
-rw-r--r--  source/blender/blenlib/intern/math_base_inline.c | 98
1 file changed, 98 insertions(+), 0 deletions(-)
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index f70c12bd26e..7ab2b784812 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -35,6 +35,10 @@
#include <stdlib.h>
#include <string.h>
+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
#include "BLI_math_base.h"
/* copied from BLI_utildefines.h */
@@ -311,4 +315,98 @@ MINLINE int signum_i(float a)
else return 0;
}
+/* Internal helpers for the SSE2 implementation.
+ *
+ * NOTE: These must ONLY be called from inside `#ifdef __SSE2__` blocks!
+ */
+
+#ifdef __SSE2__
+
+/* Calculate an initial guess for arg^exp based on the float representation.
+ * This method gives a constant bias, which can easily be compensated by
+ * multiplying by bias_coeff.
+ * Gives better results for exponents near 1 (e.g. 4/5).
+ * exp = the exponent, encoded as a uint32_t float bit pattern.
+ * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
+ * a uint32_t float bit pattern.
+ *
+ * We hope that exp and e2coeff get properly inlined.
+ */
+MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp,
+ const int e2coeff,
+ const __m128 arg)
+{
+ __m128 ret;
+ ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
+ ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+ ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
+ ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+ return ret;
+}
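+
+/* For reference, a scalar sketch of the bit trick used above (illustrative
+ * only, not part of the original patch; assumes IEEE-754 binary32 floats,
+ * a 32-bit unsigned int and positive arguments). The integer representation
+ * of a positive float is roughly linear in log2 of its value, so scaling
+ * those bits by exp approximates raising the value to that power. */
+MINLINE float _bli_math_fastpow_scalar_sketch(const unsigned int exp,
+                                              const unsigned int e2coeff,
+                                              const float arg)
+{
+  union { float f; unsigned int i; } u;
+  u.i = e2coeff;
+  u.f = arg * u.f;                   /* Pre-scale to compensate the bias. */
+  float bits = (float)u.i;           /* Bits ~= 2^23 * (log2(x) + 127). */
+  u.i = exp;
+  u.i = (unsigned int)(bits * u.f + 0.5f);  /* Scale, round back to bits. */
+  return u.f;
+}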
+
+/* Improve an x^(1/5) solution with one Newton-Raphson step. */
+MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(
+ const __m128 old_result,
+ const __m128 x)
+{
+ __m128 approx2 = _mm_mul_ps(old_result, old_result);
+ __m128 approx4 = _mm_mul_ps(approx2, approx2);
+ __m128 t = _mm_div_ps(x, approx4);
+  __m128 sum = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma candidate */
+  return _mm_mul_ps(sum, _mm_set1_ps(1.0f / 5.0f));
+}
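+
+/* A scalar form of the same step, for reference (illustrative only, not
+ * part of the original patch). Solving r^5 = x with Newton-Raphson:
+ *   r' = r - (r^5 - x) / (5 * r^4) = (4 * r + x / r^4) / 5,
+ * which is exactly what the vector code above computes. */
+MINLINE float _bli_math_improve_5throot_scalar_sketch(const float r, const float x)
+{
+  const float r4 = (r * r) * (r * r);
+  return (4.0f * r + x / r4) * (1.0f / 5.0f);
+}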
+
+/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
+MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
+{
+  /* The max., avg. and |avg| errors were calculated with GCC without FMA
+   * instructions. The final precision should be better than powf() in glibc. */
+
+  /* Calculate x^(4/5): the coefficient 0.994 was constructed manually
+   * to minimize the average error.
+   */
+ /* 0x3F4CCCCD = 4/5 */
+ /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
+ /* error max = 0.17 avg = 0.0018 |avg| = 0.05 */
+ __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
+ __m128 arg2 = _mm_mul_ps(arg, arg);
+ __m128 arg4 = _mm_mul_ps(arg2, arg2);
+ /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
+ x = _bli_math_improve_5throot_solution(x, arg4);
+ /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */
+ x = _bli_math_improve_5throot_solution(x, arg4);
+ /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */
+ x = _bli_math_improve_5throot_solution(x, arg4);
+ return _mm_mul_ps(x, _mm_mul_ps(x, x));
+}
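+
+/* Hypothetical usage sketch (not part of the original patch): raise four
+ * packed floats to the power 2.4, e.g. for gamma-style conversions.
+ * Inputs are assumed to lie inside the stated working domain. */
+MALWAYS_INLINE void _bli_math_fastpow24_usage_sketch(const float in[4], float out[4])
+{
+  _mm_storeu_ps(out, _bli_math_fastpow24(_mm_loadu_ps(in)));
+}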
+
+/* Calculate powf(x, 1.0f / 2.4) */
+MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
+{
+  /* The exponent 5/12 is too small for the fastpow trick, so compute the
+   * 4th root of x^(20/12) instead: 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3, and
+   * 2/3 is a suitable argument for fastpow.
+   * Weighting coefficient: a^(-1/2) = 2*a with a = 2^(-2/3). */
+ __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
+ __m128 xover = _mm_mul_ps(arg, xf);
+ __m128 xfm1 = _mm_rsqrt_ps(xf);
+ __m128 x2 = _mm_mul_ps(arg, arg);
+ __m128 xunder = _mm_mul_ps(x2, xfm1);
+ /* sqrt2 * over + 2 * sqrt2 * under */
+ __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+ _mm_add_ps(xover, xunder));
+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+ return xavg;
+}
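+
+/* Scalar reference for what the function above approximates (illustrative
+ * only, not part of the original patch; assumes powf()/sqrtf() from
+ * <math.h>). Both xover and xunder approximate x^(5/3), and the two
+ * rsqrt+mul steps each halve the exponent, giving x^(5/12) == x^(1/2.4). */
+MINLINE float _bli_math_fastpow512_reference_sketch(const float x)
+{
+  return sqrtf(sqrtf(powf(x, 5.0f / 3.0f)));
+}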
+
+MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask,
+ const __m128 a,
+ const __m128 b)
+{
+ return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
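+
+/* Hypothetical usage sketch (not part of the original patch): a per-lane
+ * select, e.g. clamping negative lanes to zero with a comparison mask. */
+MALWAYS_INLINE __m128 _bli_math_blend_usage_sketch(const __m128 v)
+{
+  const __m128 zero = _mm_setzero_ps();
+  /* Mask lanes are all-ones where v > 0, all-zeros elsewhere. */
+  return _bli_math_blend_sse(_mm_cmpgt_ps(v, zero), v, zero);
+}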
+
+#endif /* __SSE2__ */
+
#endif /* __MATH_BASE_INLINE_C__ */