/* * ***** BEGIN GPL LICENSE BLOCK ***** * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV. * All rights reserved. * * The Original Code is: some of this file. * * ***** END GPL LICENSE BLOCK ***** * */ /** \file blender/blenlib/intern/math_base_inline.c * \ingroup bli */ #ifndef __MATH_BASE_INLINE_C__ #define __MATH_BASE_INLINE_C__ #include #include #include #ifdef __SSE2__ # include #endif #include "BLI_math_base.h" /* copied from BLI_utildefines.h */ #ifdef __GNUC__ # define UNLIKELY(x) __builtin_expect(!!(x), 0) #else # define UNLIKELY(x) (x) #endif /* powf is really slow for raising to integer powers. */ MINLINE float pow2f(float x) { return x * x; } MINLINE float pow3f(float x) { return pow2f(x) * x; } MINLINE float pow4f(float x) { return pow2f(pow2f(x)); } MINLINE float pow7f(float x) { return pow2f(pow3f(x)) * x; } MINLINE float sqrt3f(float f) { if (UNLIKELY(f == 0.0f)) return 0.0f; else if (UNLIKELY(f < 0.0f)) return -(float)(exp(log(-f) / 3.0)); else return (float)(exp(log( f) / 3.0)); } MINLINE double sqrt3d(double d) { if (UNLIKELY(d == 0.0)) return 0.0; else if (UNLIKELY(d < 0.0)) return -exp(log(-d) / 3.0); else return exp(log( d) / 3.0); } MINLINE float sqrtf_signed(float f) { return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f); } MINLINE float saacos(float fac) { if (UNLIKELY(fac <= -1.0f)) return (float)M_PI; else if (UNLIKELY(fac >= 1.0f)) return 0.0f; else return acosf(fac); } MINLINE float saasin(float fac) { if (UNLIKELY(fac <= -1.0f)) return (float)-M_PI / 2.0f; else if (UNLIKELY(fac >= 1.0f)) return (float) M_PI / 2.0f; else return asinf(fac); } MINLINE float sasqrt(float fac) { if (UNLIKELY(fac <= 0.0f)) return 0.0f; else return sqrtf(fac); } MINLINE float saacosf(float fac) { if (UNLIKELY(fac <= -1.0f)) return (float)M_PI; else if (UNLIKELY(fac >= 1.0f)) return 0.0f; else return acosf(fac); } MINLINE float saasinf(float fac) { if (UNLIKELY(fac <= -1.0f)) return (float)-M_PI / 2.0f; else if (UNLIKELY(fac >= 1.0f)) return (float) M_PI / 2.0f; else return asinf(fac); } MINLINE float sasqrtf(float fac) { if (UNLIKELY(fac <= 0.0f)) return 0.0f; else return sqrtf(fac); } MINLINE float interpf(float target, float origin, float fac) { return (fac * target) + (1.0f - fac) * origin; } /* used for zoom values*/ MINLINE float power_of_2(float val) { return (float)pow(2.0, ceil(log((double)val) / M_LN2)); } MINLINE int is_power_of_2_i(int n) { return (n & (n - 1)) == 0; } MINLINE int power_of_2_max_i(int n) { if (is_power_of_2_i(n)) return n; do { n = n & (n - 1); } while (!is_power_of_2_i(n)); return n * 2; } MINLINE int power_of_2_min_i(int n) { while (!is_power_of_2_i(n)) n = n & (n - 1); return n; } MINLINE unsigned int power_of_2_max_u(unsigned int x) { x -= 1; x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); return x + 1; } MINLINE unsigned power_of_2_min_u(unsigned x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); return x - (x >> 1); } MINLINE int iroundf(float a) { return (int)floorf(a + 0.5f); } /* integer division that rounds 0.5 up, particularly useful for color blending * with integers, to avoid gradual darkening when rounding down */ MINLINE int divide_round_i(int a, int b) { return (2 * a + b) / (2 * b); } /** * modulo that handles negative numbers, works the same as Python's. */ MINLINE int mod_i(int i, int n) { return (i % n + n) % n; } MINLINE float min_ff(float a, float b) { return (a < b) ? a : b; } MINLINE float max_ff(float a, float b) { return (a > b) ? a : b; } MINLINE int min_ii(int a, int b) { return (a < b) ? a : b; } MINLINE int max_ii(int a, int b) { return (b < a) ? a : b; } MINLINE float min_fff(float a, float b, float c) { return min_ff(min_ff(a, b), c); } MINLINE float max_fff(float a, float b, float c) { return max_ff(max_ff(a, b), c); } MINLINE int min_iii(int a, int b, int c) { return min_ii(min_ii(a, b), c); } MINLINE int max_iii(int a, int b, int c) { return max_ii(max_ii(a, b), c); } MINLINE float min_ffff(float a, float b, float c, float d) { return min_ff(min_fff(a, b, c), d); } MINLINE float max_ffff(float a, float b, float c, float d) { return max_ff(max_fff(a, b, c), d); } MINLINE int min_iiii(int a, int b, int c, int d) { return min_ii(min_iii(a, b, c), d); } MINLINE int max_iiii(int a, int b, int c, int d) { return max_ii(max_iii(a, b, c), d); } /** * Almost-equal for IEEE floats, using absolute difference method. * * \param max_diff the maximum absolute difference. */ MINLINE int compare_ff(float a, float b, const float max_diff) { return fabsf(a - b) <= max_diff; } /** * Almost-equal for IEEE floats, using their integer representation (mixing ULP and absolute difference methods). * * \param max_diff is the maximum absolute difference (allows to take care of the near-zero area, * where relative difference methods cannot really work). * \param max_ulps is the 'maximum number of floats + 1' allowed between \a a and \a b to consider them equal. * * \see https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ */ MINLINE int compare_ff_relative(float a, float b, const float max_diff, const int max_ulps) { union {float f; int i;} ua, ub; #if 0 /* No BLI_assert in INLINE :/ */ BLI_assert(sizeof(float) == sizeof(int)); BLI_assert(max_ulps < (1 << 22)); #endif if (fabsf(a - b) <= max_diff) { return 1; } ua.f = a; ub.f = b; /* Important to compare sign from integers, since (-0.0f < 0) is false * (though this shall not be an issue in common cases)... */ return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0; } MINLINE float signf(float f) { return (f < 0.f) ? -1.f : 1.f; } MINLINE int signum_i_ex(float a, float eps) { if (a > eps) return 1; if (a < -eps) return -1; else return 0; } MINLINE int signum_i(float a) { if (a > 0.0f) return 1; if (a < 0.0f) return -1; else return 0; } /* Internal helpers for SSE2 implementation. * * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!! */ #ifdef __SSE2__ /* Calculate initial guess for arg^exp based on float representation * This method gives a constant bias, which can be easily compensated by * multiplicating with bias_coeff. * Gives better results for exponents near 1 (e. g. 4/5). * exp = exponent, encoded as uint32_t * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as * uint32_t * * We hope that exp and e2coeff gets properly inlined */ MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, const int e2coeff, const __m128 arg) { __m128 ret; ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff))); ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp))); ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); return ret; } /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */ MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution( const __m128 old_result, const __m128 x) { __m128 approx2 = _mm_mul_ps(old_result, old_result); __m128 approx4 = _mm_mul_ps(approx2, approx2); __m128 t = _mm_div_ps(x, approx4); __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */ return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f)); } /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg) { /* max, avg and |avg| errors were calculated in gcc without FMA instructions * The final precision should be better than powf in glibc */ /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize * avg error. */ /* 0x3F4CCCCD = 4/5 */ /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ /* error max = 0.17 avg = 0.0018 |avg| = 0.05 */ __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg); __m128 arg2 = _mm_mul_ps(arg, arg); __m128 arg4 = _mm_mul_ps(arg2, arg2); /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ x = _bli_math_improve_5throot_solution(x, arg4); /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */ x = _bli_math_improve_5throot_solution(x, arg4); /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */ x = _bli_math_improve_5throot_solution(x, arg4); return _mm_mul_ps(x, _mm_mul_ps(x, x)); } /* Calculate powf(x, 1.0f / 2.4) */ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg) { /* 5/12 is too small, so compute the 4th root of 20/12 instead. * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 */ __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg); __m128 xover = _mm_mul_ps(arg, xf); __m128 xfm1 = _mm_rsqrt_ps(xf); __m128 x2 = _mm_mul_ps(arg, arg); __m128 xunder = _mm_mul_ps(x2, xfm1); /* sqrt2 * over + 2 * sqrt2 * under */ __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _mm_add_ps(xover, xunder)); xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); return xavg; } MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, const __m128 a, const __m128 b) { return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); } #endif /* __SSE2__ */ #endif /* __MATH_BASE_INLINE_C__ */