diff options
author | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:17:24 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:21:24 +0300 |
commit | e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch) | |
tree | 8cf3453d12edb177a218ef8009357518ec6cab6a /source/blender/blenlib/intern/math_base_inline.c | |
parent | b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff) |
ClangFormat: apply to source, most of intern
Apply clang format as proposed in T53211.
For details on usage and instructions for migrating branches
without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
Diffstat (limited to 'source/blender/blenlib/intern/math_base_inline.c')
-rw-r--r-- | source/blender/blenlib/intern/math_base_inline.c | 538 |
1 files changed, 315 insertions, 223 deletions
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c index b8a5a138eb4..f1ceaca4eb1 100644 --- a/source/blender/blenlib/intern/math_base_inline.c +++ b/source/blender/blenlib/intern/math_base_inline.c @@ -40,196 +40,269 @@ /* copied from BLI_utildefines.h */ #ifdef __GNUC__ -# define UNLIKELY(x) __builtin_expect(!!(x), 0) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) #else -# define UNLIKELY(x) (x) +# define UNLIKELY(x) (x) #endif /* powf is really slow for raising to integer powers. */ MINLINE float pow2f(float x) { - return x * x; + return x * x; } MINLINE float pow3f(float x) { - return pow2f(x) * x; + return pow2f(x) * x; } MINLINE float pow4f(float x) { - return pow2f(pow2f(x)); + return pow2f(pow2f(x)); } MINLINE float pow7f(float x) { - return pow2f(pow3f(x)) * x; + return pow2f(pow3f(x)) * x; } MINLINE float sqrt3f(float f) { - if (UNLIKELY(f == 0.0f)) { return 0.0f; } - else if (UNLIKELY(f < 0.0f)) { return -(float)(exp(log(-f) / 3.0)); } - else { return (float)(exp(log( f) / 3.0)); } + if (UNLIKELY(f == 0.0f)) { + return 0.0f; + } + else if (UNLIKELY(f < 0.0f)) { + return -(float)(exp(log(-f) / 3.0)); + } + else { + return (float)(exp(log(f) / 3.0)); + } } MINLINE double sqrt3d(double d) { - if (UNLIKELY(d == 0.0)) { return 0.0; } - else if (UNLIKELY(d < 0.0)) { return -exp(log(-d) / 3.0); } - else { return exp(log( d) / 3.0); } + if (UNLIKELY(d == 0.0)) { + return 0.0; + } + else if (UNLIKELY(d < 0.0)) { + return -exp(log(-d) / 3.0); + } + else { + return exp(log(d) / 3.0); + } } MINLINE float sqrtf_signed(float f) { - return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f); + return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f); } MINLINE float saacos(float fac) { - if (UNLIKELY(fac <= -1.0f)) { return (float)M_PI; } - else if (UNLIKELY(fac >= 1.0f)) { return 0.0f; } - else { return acosf(fac); } + if (UNLIKELY(fac <= -1.0f)) { + return (float)M_PI; + } + else if (UNLIKELY(fac >= 1.0f)) { + return 0.0f; + } + else { + return acosf(fac); + } } MINLINE float saasin(float fac) { - if (UNLIKELY(fac <= -1.0f)) { return (float)-M_PI / 2.0f; } - else if (UNLIKELY(fac >= 1.0f)) { return (float) M_PI / 2.0f; } - else { return asinf(fac); } + if (UNLIKELY(fac <= -1.0f)) { + return (float)-M_PI / 2.0f; + } + else if (UNLIKELY(fac >= 1.0f)) { + return (float)M_PI / 2.0f; + } + else { + return asinf(fac); + } } MINLINE float sasqrt(float fac) { - if (UNLIKELY(fac <= 0.0f)) { return 0.0f; } - else { return sqrtf(fac); } + if (UNLIKELY(fac <= 0.0f)) { + return 0.0f; + } + else { + return sqrtf(fac); + } } MINLINE float saacosf(float fac) { - if (UNLIKELY(fac <= -1.0f)) { return (float)M_PI; } - else if (UNLIKELY(fac >= 1.0f)) { return 0.0f; } - else { return acosf(fac); } + if (UNLIKELY(fac <= -1.0f)) { + return (float)M_PI; + } + else if (UNLIKELY(fac >= 1.0f)) { + return 0.0f; + } + else { + return acosf(fac); + } } MINLINE float saasinf(float fac) { - if (UNLIKELY(fac <= -1.0f)) { return (float)-M_PI / 2.0f; } - else if (UNLIKELY(fac >= 1.0f)) { return (float) M_PI / 2.0f; } - else { return asinf(fac); } + if (UNLIKELY(fac <= -1.0f)) { + return (float)-M_PI / 2.0f; + } + else if (UNLIKELY(fac >= 1.0f)) { + return (float)M_PI / 2.0f; + } + else { + return asinf(fac); + } } MINLINE float sasqrtf(float fac) { - if (UNLIKELY(fac <= 0.0f)) { return 0.0f; } - else { return sqrtf(fac); } + if (UNLIKELY(fac <= 0.0f)) { + return 0.0f; + } + else { + return sqrtf(fac); + } } MINLINE float interpf(float target, float origin, float fac) { - return (fac * target) + (1.0f - fac) * origin; + return (fac * target) + (1.0f - fac) * origin; } /* used for zoom values*/ MINLINE float power_of_2(float val) { - return (float)pow(2.0, ceil(log((double)val) / M_LN2)); + return (float)pow(2.0, ceil(log((double)val) / M_LN2)); } MINLINE int is_power_of_2_i(int n) { - return (n & (n - 1)) == 0; + return (n & (n - 1)) == 0; } MINLINE int power_of_2_max_i(int n) { - if (is_power_of_2_i(n)) { - return n; - } + if (is_power_of_2_i(n)) { + return n; + } - do { - n = n & (n - 1); - } while (!is_power_of_2_i(n)); + do { + n = n & (n - 1); + } while (!is_power_of_2_i(n)); - return n * 2; + return n * 2; } MINLINE int power_of_2_min_i(int n) { - while (!is_power_of_2_i(n)) { - n = n & (n - 1); - } + while (!is_power_of_2_i(n)) { + n = n & (n - 1); + } - return n; + return n; } MINLINE unsigned int power_of_2_max_u(unsigned int x) { - x -= 1; - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); - return x + 1; + x -= 1; + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x + 1; } MINLINE unsigned power_of_2_min_u(unsigned x) { - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); - return x - (x >> 1); + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); } /* rounding and clamping */ -#define _round_clamp_fl_impl(arg, ty, min, max) { \ - float r = floorf(arg + 0.5f); \ - if (UNLIKELY(r <= (float)min)) return (ty)min; \ - else if (UNLIKELY(r >= (float)max)) return (ty)max; \ - else return (ty)r; \ +#define _round_clamp_fl_impl(arg, ty, min, max) \ + { \ + float r = floorf(arg + 0.5f); \ + if (UNLIKELY(r <= (float)min)) \ + return (ty)min; \ + else if (UNLIKELY(r >= (float)max)) \ + return (ty)max; \ + else \ + return (ty)r; \ + } + +#define _round_clamp_db_impl(arg, ty, min, max) \ + { \ + double r = floor(arg + 0.5); \ + if (UNLIKELY(r <= (double)min)) \ + return (ty)min; \ + else if (UNLIKELY(r >= (double)max)) \ + return (ty)max; \ + else \ + return (ty)r; \ + } + +#define _round_fl_impl(arg, ty) \ + { \ + return (ty)floorf(arg + 0.5f); \ + } +#define _round_db_impl(arg, ty) \ + { \ + return (ty)floor(arg + 0.5); \ + } + +MINLINE signed char round_fl_to_char(float a){_round_fl_impl(a, signed char)} MINLINE + unsigned char round_fl_to_uchar(float a){_round_fl_impl(a, unsigned char)} MINLINE + short round_fl_to_short(float a){_round_fl_impl(a, short)} MINLINE + unsigned short round_fl_to_ushort(float a){_round_fl_impl(a, unsigned short)} MINLINE + int round_fl_to_int(float a){_round_fl_impl(a, int)} MINLINE + unsigned int round_fl_to_uint(float a){_round_fl_impl(a, unsigned int)} + +MINLINE signed char round_db_to_char(double a){_round_db_impl(a, signed char)} MINLINE + unsigned char round_db_to_uchar(double a){_round_db_impl(a, unsigned char)} MINLINE + short round_db_to_short(double a){_round_db_impl(a, short)} MINLINE + unsigned short round_db_to_ushort(double a){_round_db_impl(a, unsigned short)} MINLINE + int round_db_to_int(double a){_round_db_impl(a, int)} MINLINE + unsigned int round_db_to_uint(double a) +{ + _round_db_impl(a, unsigned int) } -#define _round_clamp_db_impl(arg, ty, min, max) { \ - double r = floor(arg + 0.5); \ - if (UNLIKELY(r <= (double)min)) return (ty)min; \ - else if (UNLIKELY(r >= (double)max)) return (ty)max; \ - else return (ty)r; \ -} - -#define _round_fl_impl(arg, ty) { return (ty)floorf(arg + 0.5f); } -#define _round_db_impl(arg, ty) { return (ty)floor(arg + 0.5); } - -MINLINE signed char round_fl_to_char(float a) { _round_fl_impl(a, signed char) } -MINLINE unsigned char round_fl_to_uchar(float a) { _round_fl_impl(a, unsigned char) } -MINLINE short round_fl_to_short(float a) { _round_fl_impl(a, short) } -MINLINE unsigned short round_fl_to_ushort(float a) { _round_fl_impl(a, unsigned short) } -MINLINE int round_fl_to_int(float a) { _round_fl_impl(a, int) } -MINLINE unsigned int round_fl_to_uint(float a) { _round_fl_impl(a, unsigned int) } - -MINLINE signed char round_db_to_char(double a) { _round_db_impl(a, signed char) } -MINLINE unsigned char round_db_to_uchar(double a) { _round_db_impl(a, unsigned char) } -MINLINE short round_db_to_short(double a) { _round_db_impl(a, short) } -MINLINE unsigned short round_db_to_ushort(double a) { _round_db_impl(a, unsigned short) } -MINLINE int round_db_to_int(double a) { _round_db_impl(a, int) } -MINLINE unsigned int round_db_to_uint(double a) { _round_db_impl(a, unsigned int) } - #undef _round_fl_impl #undef _round_db_impl -MINLINE signed char round_fl_to_char_clamp(float a) { _round_clamp_fl_impl(a, signed char, SCHAR_MIN, SCHAR_MAX) } -MINLINE unsigned char round_fl_to_uchar_clamp(float a) { _round_clamp_fl_impl(a, unsigned char, 0, UCHAR_MAX) } -MINLINE short round_fl_to_short_clamp(float a) { _round_clamp_fl_impl(a, short, SHRT_MIN, SHRT_MAX) } -MINLINE unsigned short round_fl_to_ushort_clamp(float a) { _round_clamp_fl_impl(a, unsigned short, 0, USHRT_MAX) } -MINLINE int round_fl_to_int_clamp(float a) { _round_clamp_fl_impl(a, int, INT_MIN, INT_MAX) } -MINLINE unsigned int round_fl_to_uint_clamp(float a) { _round_clamp_fl_impl(a, unsigned int, 0, UINT_MAX) } - -MINLINE signed char round_db_to_char_clamp(double a) { _round_clamp_db_impl(a, signed char, SCHAR_MIN, SCHAR_MAX) } -MINLINE unsigned char round_db_to_uchar_clamp(double a) { _round_clamp_db_impl(a, unsigned char, 0, UCHAR_MAX) } -MINLINE short round_db_to_short_clamp(double a) { _round_clamp_db_impl(a, short, SHRT_MIN, SHRT_MAX) } -MINLINE unsigned short round_db_to_ushort_clamp(double a) { _round_clamp_db_impl(a, unsigned short, 0, USHRT_MAX) } -MINLINE int round_db_to_int_clamp(double a) { _round_clamp_db_impl(a, int, INT_MIN, INT_MAX) } -MINLINE unsigned int round_db_to_uint_clamp(double a) { _round_clamp_db_impl(a, unsigned int, 0, UINT_MAX) } +MINLINE signed char round_fl_to_char_clamp(float a){ + _round_clamp_fl_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE + unsigned char round_fl_to_uchar_clamp(float a){ + _round_clamp_fl_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE + short round_fl_to_short_clamp(float a){ + _round_clamp_fl_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE + unsigned short round_fl_to_ushort_clamp(float a){ + _round_clamp_fl_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE + int round_fl_to_int_clamp(float a){_round_clamp_fl_impl(a, int, INT_MIN, INT_MAX)} MINLINE + unsigned int round_fl_to_uint_clamp(float a){ + _round_clamp_fl_impl(a, unsigned int, 0, UINT_MAX)} + +MINLINE signed char round_db_to_char_clamp(double a){ + _round_clamp_db_impl(a, signed char, SCHAR_MIN, SCHAR_MAX)} MINLINE + unsigned char round_db_to_uchar_clamp(double a){ + _round_clamp_db_impl(a, unsigned char, 0, UCHAR_MAX)} MINLINE + short round_db_to_short_clamp(double a){ + _round_clamp_db_impl(a, short, SHRT_MIN, SHRT_MAX)} MINLINE + unsigned short round_db_to_ushort_clamp(double a){ + _round_clamp_db_impl(a, unsigned short, 0, USHRT_MAX)} MINLINE + int round_db_to_int_clamp(double a){_round_clamp_db_impl(a, int, INT_MIN, INT_MAX)} MINLINE + unsigned int round_db_to_uint_clamp(double a) +{ + _round_clamp_db_impl(a, unsigned int, 0, UINT_MAX) +} #undef _round_clamp_fl_impl #undef _round_clamp_db_impl @@ -238,7 +311,7 @@ MINLINE unsigned int round_db_to_uint_clamp(double a) { _round_clamp_db_impl(a * with integers, to avoid gradual darkening when rounding down */ MINLINE int divide_round_i(int a, int b) { - return (2 * a + b) / (2 * b); + return (2 * a + b) / (2 * b); } /** @@ -247,9 +320,9 @@ MINLINE int divide_round_i(int a, int b) */ MINLINE int divide_floor_i(int a, int b) { - int d = a / b; - int r = a % b; /* Optimizes into a single division. */ - return r ? d - ((a < 0) ^ (b < 0)) : d; + int d = a / b; + int r = a % b; /* Optimizes into a single division. */ + return r ? d - ((a < 0) ^ (b < 0)) : d; } /** @@ -257,91 +330,91 @@ MINLINE int divide_floor_i(int a, int b) */ MINLINE int mod_i(int i, int n) { - return (i % n + n) % n; + return (i % n + n) % n; } MINLINE float min_ff(float a, float b) { - return (a < b) ? a : b; + return (a < b) ? a : b; } MINLINE float max_ff(float a, float b) { - return (a > b) ? a : b; + return (a > b) ? a : b; } MINLINE int min_ii(int a, int b) { - return (a < b) ? a : b; + return (a < b) ? a : b; } MINLINE int max_ii(int a, int b) { - return (b < a) ? a : b; + return (b < a) ? a : b; } MINLINE float min_fff(float a, float b, float c) { - return min_ff(min_ff(a, b), c); + return min_ff(min_ff(a, b), c); } MINLINE float max_fff(float a, float b, float c) { - return max_ff(max_ff(a, b), c); + return max_ff(max_ff(a, b), c); } MINLINE int min_iii(int a, int b, int c) { - return min_ii(min_ii(a, b), c); + return min_ii(min_ii(a, b), c); } MINLINE int max_iii(int a, int b, int c) { - return max_ii(max_ii(a, b), c); + return max_ii(max_ii(a, b), c); } MINLINE float min_ffff(float a, float b, float c, float d) { - return min_ff(min_fff(a, b, c), d); + return min_ff(min_fff(a, b, c), d); } MINLINE float max_ffff(float a, float b, float c, float d) { - return max_ff(max_fff(a, b, c), d); + return max_ff(max_fff(a, b, c), d); } MINLINE int min_iiii(int a, int b, int c, int d) { - return min_ii(min_iii(a, b, c), d); + return min_ii(min_iii(a, b, c), d); } MINLINE int max_iiii(int a, int b, int c, int d) { - return max_ii(max_iii(a, b, c), d); + return max_ii(max_iii(a, b, c), d); } MINLINE size_t min_zz(size_t a, size_t b) { - return (a < b) ? a : b; + return (a < b) ? a : b; } MINLINE size_t max_zz(size_t a, size_t b) { - return (b < a) ? a : b; + return (b < a) ? a : b; } MINLINE int clamp_i(int value, int min, int max) { - return min_ii(max_ii(value, min), max); + return min_ii(max_ii(value, min), max); } MINLINE float clamp_f(float value, float min, float max) { - if (value > max) { - return max; - } - else if (value < min) { - return min; - } - return value; + if (value > max) { + return max; + } + else if (value < min) { + return min; + } + return value; } MINLINE size_t clamp_z(size_t value, size_t min, size_t max) { - return min_zz(max_zz(value, min), max); + return min_zz(max_zz(value, min), max); } /** @@ -351,7 +424,7 @@ MINLINE size_t clamp_z(size_t value, size_t min, size_t max) */ MINLINE int compare_ff(float a, float b, const float max_diff) { - return fabsf(a - b) <= max_diff; + return fabsf(a - b) <= max_diff; } /** @@ -365,59 +438,74 @@ MINLINE int compare_ff(float a, float b, const float max_diff) */ MINLINE int compare_ff_relative(float a, float b, const float max_diff, const int max_ulps) { - union {float f; int i;} ua, ub; + union { + float f; + int i; + } ua, ub; - BLI_assert(sizeof(float) == sizeof(int)); - BLI_assert(max_ulps < (1 << 22)); + BLI_assert(sizeof(float) == sizeof(int)); + BLI_assert(max_ulps < (1 << 22)); - if (fabsf(a - b) <= max_diff) { - return 1; - } + if (fabsf(a - b) <= max_diff) { + return 1; + } - ua.f = a; - ub.f = b; + ua.f = a; + ub.f = b; - /* Important to compare sign from integers, since (-0.0f < 0) is false - * (though this shall not be an issue in common cases)... */ - return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0; + /* Important to compare sign from integers, since (-0.0f < 0) is false + * (though this shall not be an issue in common cases)... */ + return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0; } MINLINE float signf(float f) { - return (f < 0.f) ? -1.f : 1.f; + return (f < 0.f) ? -1.f : 1.f; } MINLINE int signum_i_ex(float a, float eps) { - if (a > eps) { return 1; } - if (a < -eps) { return -1; } - else { return 0; } + if (a > eps) { + return 1; + } + if (a < -eps) { + return -1; + } + else { + return 0; + } } MINLINE int signum_i(float a) { - if (a > 0.0f) { return 1; } - if (a < 0.0f) { return -1; } - else { return 0; } + if (a > 0.0f) { + return 1; + } + if (a < 0.0f) { + return -1; + } + else { + return 0; + } } /** Returns number of (base ten) *significant* digits of integer part of given float * (negative in case of decimal-only floats, 0.01 returns -1 e.g.). */ MINLINE int integer_digits_f(const float f) { - return (f == 0.0f) ? 0 : (int)floor(log10(fabs(f))) + 1; + return (f == 0.0f) ? 0 : (int)floor(log10(fabs(f))) + 1; } /** Returns number of (base ten) *significant* digits of integer part of given double * (negative in case of decimal-only floats, 0.01 returns -1 e.g.). */ MINLINE int integer_digits_d(const double d) { - return (d == 0.0) ? 0 : (int)floor(log10(fabs(d))) + 1; + return (d == 0.0) ? 0 : (int)floor(log10(fabs(d))) + 1; } MINLINE int integer_digits_i(const int i) { - return (int)log10((double)i) + 1; + return (int)log10((double)i) + 1; } /* Internal helpers for SSE2 implementation. @@ -437,112 +525,116 @@ MINLINE int integer_digits_i(const int i) * * We hope that exp and e2coeff gets properly inlined */ -MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, - const int e2coeff, - const __m128 arg) +MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, const int e2coeff, const __m128 arg) { - __m128 ret; - ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff))); - ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); - ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp))); - ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); - return ret; + __m128 ret; + ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff))); + ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); + ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp))); + ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); + return ret; } /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */ -MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution( - const __m128 old_result, - const __m128 x) +MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(const __m128 old_result, const __m128 x) { - __m128 approx2 = _mm_mul_ps(old_result, old_result); - __m128 approx4 = _mm_mul_ps(approx2, approx2); - __m128 t = _mm_div_ps(x, approx4); - __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */ - return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f)); + __m128 approx2 = _mm_mul_ps(old_result, old_result); + __m128 approx4 = _mm_mul_ps(approx2, approx2); + __m128 t = _mm_div_ps(x, approx4); + __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */ + return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f)); } /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg) { - /* max, avg and |avg| errors were calculated in gcc without FMA instructions - * The final precision should be better than powf in glibc */ - - /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize - * avg error. - */ - /* 0x3F4CCCCD = 4/5 */ - /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ - /* error max = 0.17, avg = 0.0018, |avg| = 0.05 */ - __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg); - __m128 arg2 = _mm_mul_ps(arg, arg); - __m128 arg4 = _mm_mul_ps(arg2, arg2); - /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ - x = _bli_math_improve_5throot_solution(x, arg4); - /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */ - x = _bli_math_improve_5throot_solution(x, arg4); - /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */ - x = _bli_math_improve_5throot_solution(x, arg4); - return _mm_mul_ps(x, _mm_mul_ps(x, x)); + /* max, avg and |avg| errors were calculated in gcc without FMA instructions + * The final precision should be better than powf in glibc */ + + /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize + * avg error. + */ + /* 0x3F4CCCCD = 4/5 */ + /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ + /* error max = 0.17, avg = 0.0018, |avg| = 0.05 */ + __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg); + __m128 arg2 = _mm_mul_ps(arg, arg); + __m128 arg4 = _mm_mul_ps(arg2, arg2); + /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ + x = _bli_math_improve_5throot_solution(x, arg4); + /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */ + x = _bli_math_improve_5throot_solution(x, arg4); + /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */ + x = _bli_math_improve_5throot_solution(x, arg4); + return _mm_mul_ps(x, _mm_mul_ps(x, x)); } /* Calculate powf(x, 1.0f / 2.4) */ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg) { - /* 5/12 is too small, so compute the 4th root of 20/12 instead. - * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. - * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 - */ - __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg); - __m128 xover = _mm_mul_ps(arg, xf); - __m128 xfm1 = _mm_rsqrt_ps(xf); - __m128 x2 = _mm_mul_ps(arg, arg); - __m128 xunder = _mm_mul_ps(x2, xfm1); - /* sqrt2 * over + 2 * sqrt2 * under */ - __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), - _mm_add_ps(xover, xunder)); - xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); - xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); - return xavg; + /* 5/12 is too small, so compute the 4th root of 20/12 instead. + * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. + * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 + */ + __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg); + __m128 xover = _mm_mul_ps(arg, xf); + __m128 xfm1 = _mm_rsqrt_ps(xf); + __m128 x2 = _mm_mul_ps(arg, arg); + __m128 xunder = _mm_mul_ps(x2, xfm1); + /* sqrt2 * over + 2 * sqrt2 * under */ + __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), + _mm_add_ps(xover, xunder)); + xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); + xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); + return xavg; } -MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, - const __m128 a, - const __m128 b) +MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, const __m128 a, const __m128 b) { - return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); + return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); } -#endif /* __SSE2__ */ +#endif /* __SSE2__ */ /* Low level conversion functions */ MINLINE unsigned char unit_float_to_uchar_clamp(float val) { - return (unsigned char)(((val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : ((255.0f * val) + 0.5f)))); + return (unsigned char)(( + (val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : ((255.0f * val) + 0.5f)))); } -#define unit_float_to_uchar_clamp(val) ((CHECK_TYPE_INLINE(val, float)), unit_float_to_uchar_clamp(val)) +#define unit_float_to_uchar_clamp(val) \ + ((CHECK_TYPE_INLINE(val, float)), unit_float_to_uchar_clamp(val)) MINLINE unsigned short unit_float_to_ushort_clamp(float val) { - return (unsigned short)((val >= 1.0f - 0.5f / 65535) ? 65535 : (val <= 0.0f) ? 0 : (val * 65535.0f + 0.5f)); + return (unsigned short)((val >= 1.0f - 0.5f / 65535) ? + 65535 : + (val <= 0.0f) ? 0 : (val * 65535.0f + 0.5f)); } -#define unit_float_to_ushort_clamp(val) ((CHECK_TYPE_INLINE(val, float)), unit_float_to_ushort_clamp(val)) +#define unit_float_to_ushort_clamp(val) \ + ((CHECK_TYPE_INLINE(val, float)), unit_float_to_ushort_clamp(val)) MINLINE unsigned char unit_ushort_to_uchar(unsigned short val) { - return (unsigned char)(((val) >= 65535 - 128) ? 255 : ((val) + 128) >> 8); -} -#define unit_ushort_to_uchar(val) ((CHECK_TYPE_INLINE(val, unsigned short)), unit_ushort_to_uchar(val)) - -#define unit_float_to_uchar_clamp_v3(v1, v2) { \ - (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \ - (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \ - (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \ -} ((void)0) -#define unit_float_to_uchar_clamp_v4(v1, v2) { \ - (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \ - (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \ - (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \ - (v1)[3] = unit_float_to_uchar_clamp((v2[3])); \ -} ((void)0) + return (unsigned char)(((val) >= 65535 - 128) ? 255 : ((val) + 128) >> 8); +} +#define unit_ushort_to_uchar(val) \ + ((CHECK_TYPE_INLINE(val, unsigned short)), unit_ushort_to_uchar(val)) + +#define unit_float_to_uchar_clamp_v3(v1, v2) \ + { \ + (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \ + (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \ + (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \ + } \ + ((void)0) +#define unit_float_to_uchar_clamp_v4(v1, v2) \ + { \ + (v1)[0] = unit_float_to_uchar_clamp((v2[0])); \ + (v1)[1] = unit_float_to_uchar_clamp((v2[1])); \ + (v1)[2] = unit_float_to_uchar_clamp((v2[2])); \ + (v1)[3] = unit_float_to_uchar_clamp((v2[3])); \ + } \ + ((void)0) #endif /* __MATH_BASE_INLINE_C__ */ |