/*
 * ***** BEGIN GPL LICENSE BLOCK *****
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
 * All rights reserved.
 *
 * The Original Code is: some of this file.
 *
 * ***** END GPL LICENSE BLOCK *****
 * */

/** \file blender/blenlib/intern/math_base_inline.c
 *  \ingroup bli
 */

#ifndef __MATH_BASE_INLINE_C__
#define __MATH_BASE_INLINE_C__

#include <float.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef __SSE2__
#  include <emmintrin.h>
#endif

#include "BLI_math_base.h"

/* copied from BLI_utildefines.h */
#ifdef __GNUC__
#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
#else
#  define UNLIKELY(x)     (x)
#endif

/* powf is really slow for raising to integer powers. */
MINLINE float pow2f(float x)
{
	return x * x;
}
MINLINE float pow3f(float x)
{
	return pow2f(x) * x;
}
MINLINE float pow4f(float x)
{
	return pow2f(pow2f(x));
}
MINLINE float pow7f(float x)
{
	return pow2f(pow3f(x)) * x;
}

MINLINE float sqrt3f(float f)
{
	if      (UNLIKELY(f == 0.0f)) return 0.0f;
	else if (UNLIKELY(f <  0.0f)) return -(float)(exp(log(-f) / 3.0));
	else                          return  (float)(exp(log( f) / 3.0));
}

MINLINE double sqrt3d(double d)
{
	if      (UNLIKELY(d == 0.0)) return 0.0;
	else if (UNLIKELY(d <  0.0)) return -exp(log(-d) / 3.0);
	else                         return  exp(log( d) / 3.0);
}

MINLINE float sqrtf_signed(float f)
{
	return (f >= 0.0f) ? sqrtf(f) : -sqrtf(-f);
}

MINLINE float saacos(float fac)
{
	if      (UNLIKELY(fac <= -1.0f)) return (float)M_PI;
	else if (UNLIKELY(fac >=  1.0f)) return 0.0f;
	else                             return acosf(fac);
}

MINLINE float saasin(float fac)
{
	if      (UNLIKELY(fac <= -1.0f)) return (float)-M_PI / 2.0f;
	else if (UNLIKELY(fac >=  1.0f)) return (float) M_PI / 2.0f;
	else                             return asinf(fac);
}

MINLINE float sasqrt(float fac)
{
	if (UNLIKELY(fac <= 0.0f)) return 0.0f;
	else                       return sqrtf(fac);
}

MINLINE float saacosf(float fac)
{
	if      (UNLIKELY(fac <= -1.0f)) return (float)M_PI;
	else if (UNLIKELY(fac >=  1.0f)) return 0.0f;
	else                             return acosf(fac);
}

MINLINE float saasinf(float fac)
{
	if      (UNLIKELY(fac <= -1.0f)) return (float)-M_PI / 2.0f;
	else if (UNLIKELY(fac >=  1.0f)) return (float) M_PI / 2.0f;
	else                             return asinf(fac);
}

MINLINE float sasqrtf(float fac)
{
	if (UNLIKELY(fac <= 0.0f)) return 0.0f;
	else                       return sqrtf(fac);
}

MINLINE float interpf(float target, float origin, float fac)
{
	return (fac * target) + (1.0f - fac) * origin;
}

/* used for zoom values*/
MINLINE float power_of_2(float val)
{
	return (float)pow(2.0, ceil(log((double)val) / M_LN2));
}

MINLINE int is_power_of_2_i(int n)
{
	return (n & (n - 1)) == 0;
}

MINLINE int power_of_2_max_i(int n)
{
	if (is_power_of_2_i(n))
		return n;

	do {
		n = n & (n - 1);
	} while (!is_power_of_2_i(n));

	return n * 2;
}

MINLINE int power_of_2_min_i(int n)
{
	while (!is_power_of_2_i(n))
		n = n & (n - 1);

	return n;
}

MINLINE unsigned int power_of_2_max_u(unsigned int x)
{
	x -= 1;
	x |= (x >>  1);
	x |= (x >>  2);
	x |= (x >>  4);
	x |= (x >>  8);
	x |= (x >> 16);
	return x + 1;
}

MINLINE unsigned power_of_2_min_u(unsigned x)
{
	x |= (x >>  1);
	x |= (x >>  2);
	x |= (x >>  4);
	x |= (x >>  8);
	x |= (x >> 16);
	return x - (x >> 1);
}

MINLINE int iroundf(float a)
{
	return (int)floorf(a + 0.5f);
}

/* integer division that rounds 0.5 up, particularly useful for color blending
 * with integers, to avoid gradual darkening when rounding down */
MINLINE int divide_round_i(int a, int b)
{
	return (2 * a + b) / (2 * b);
}

/**
 * modulo that handles negative numbers, works the same as Python's.
 */
MINLINE int mod_i(int i, int n)
{
	return (i % n + n) % n;
}

MINLINE float min_ff(float a, float b)
{
	return (a < b) ? a : b;
}
MINLINE float max_ff(float a, float b)
{
	return (a > b) ? a : b;
}

MINLINE int min_ii(int a, int b)
{
	return (a < b) ? a : b;
}
MINLINE int max_ii(int a, int b)
{
	return (b < a) ? a : b;
}

MINLINE float min_fff(float a, float b, float c)
{
	return min_ff(min_ff(a, b), c);
}
MINLINE float max_fff(float a, float b, float c)
{
	return max_ff(max_ff(a, b), c);
}

MINLINE int min_iii(int a, int b, int c)
{
	return min_ii(min_ii(a, b), c);
}
MINLINE int max_iii(int a, int b, int c)
{
	return max_ii(max_ii(a, b), c);
}

MINLINE float min_ffff(float a, float b, float c, float d)
{
	return min_ff(min_fff(a, b, c), d);
}
MINLINE float max_ffff(float a, float b, float c, float d)
{
	return max_ff(max_fff(a, b, c), d);
}

MINLINE int min_iiii(int a, int b, int c, int d)
{
	return min_ii(min_iii(a, b, c), d);
}
MINLINE int max_iiii(int a, int b, int c, int d)
{
	return max_ii(max_iii(a, b, c), d);
}

/**
 * Almost-equal for IEEE floats, using absolute difference method.
 *
 * \param max_diff the maximum absolute difference.
 */
MINLINE int compare_ff(float a, float b, const float max_diff)
{
	return fabsf(a - b) <= max_diff;
}

/**
 * Almost-equal for IEEE floats, using their integer representation (mixing ULP and absolute difference methods).
 *
 * \param max_diff is the maximum absolute difference (allows to take care of the near-zero area,
 *                 where relative difference methods cannot really work).
 * \param max_ulps is the 'maximum number of floats + 1' allowed between \a a and \a b to consider them equal.
 *
 * \see https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
 */
MINLINE int compare_ff_relative(float a, float b, const float max_diff, const int max_ulps)
{
	union {float f; int i;} ua, ub;

#if 0  /* No BLI_assert in INLINE :/ */
	BLI_assert(sizeof(float) == sizeof(int));
	BLI_assert(max_ulps < (1 << 22));
#endif

	if (fabsf(a - b) <= max_diff) {
		return 1;
	}

	ua.f = a;
	ub.f = b;

	/* Important to compare sign from integers, since (-0.0f < 0) is false
	 * (though this shall not be an issue in common cases)... */
	return ((ua.i < 0) != (ub.i < 0)) ? 0 : (abs(ua.i - ub.i) <= max_ulps) ? 1 : 0;
}

MINLINE float signf(float f)
{
	return (f < 0.f) ? -1.f : 1.f;
}

MINLINE int signum_i_ex(float a, float eps)
{
	if (a >  eps) return  1;
	if (a < -eps) return -1;
	else          return  0;
}

MINLINE int signum_i(float a)
{
	if (a > 0.0f) return  1;
	if (a < 0.0f) return -1;
	else          return  0;
}

/* Internal helpers for SSE2 implementation.
 *
 * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!!
 */

#ifdef __SSE2__

/* Calculate initial guess for arg^exp based on float representation
 * This method gives a constant bias, which can be easily compensated by
 * multiplicating with bias_coeff.
 * Gives better results for exponents near 1 (e. g. 4/5).
 * exp = exponent, encoded as uint32_t
 * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
 * uint32_t
 *
 * We hope that exp and e2coeff gets properly inlined
 */
MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp,
                                        const int e2coeff,
                                        const __m128 arg)
{
	__m128 ret;
	ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
	ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
	ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
	ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
	return ret;
}

/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(
    const __m128 old_result,
    const __m128 x)
{
	__m128 approx2 = _mm_mul_ps(old_result, old_result);
	__m128 approx4 = _mm_mul_ps(approx2, approx2);
	__m128 t = _mm_div_ps(x, approx4);
	__m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
	return _mm_mul_ps(summ, _mm_set1_ps(1.0f / 5.0f));
}

/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
{
	/* max, avg and |avg| errors were calculated in gcc without FMA instructions
	 * The final precision should be better than powf in glibc */

	/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize
	 * avg error.
	 */
	/* 0x3F4CCCCD = 4/5 */
	/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
	/* error max = 0.17	avg = 0.0018	|avg| = 0.05 */
	__m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
	__m128 arg2 = _mm_mul_ps(arg, arg);
	__m128 arg4 = _mm_mul_ps(arg2, arg2);
	/* error max = 0.018        avg = 0.0031    |avg| = 0.0031  */
	x = _bli_math_improve_5throot_solution(x, arg4);
	/* error max = 0.00021    avg = 1.6e-05    |avg| = 1.6e-05 */
	x = _bli_math_improve_5throot_solution(x, arg4);
	/* error max = 6.1e-07    avg = 5.2e-08    |avg| = 1.1e-07 */
	x = _bli_math_improve_5throot_solution(x, arg4);
	return _mm_mul_ps(x, _mm_mul_ps(x, x));
}

/* Calculate powf(x, 1.0f / 2.4) */
MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
{
	/* 5/12 is too small, so compute the 4th root of 20/12 instead.
	 * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
	 * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
	 */
	__m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
	__m128 xover = _mm_mul_ps(arg, xf);
	__m128 xfm1 = _mm_rsqrt_ps(xf);
	__m128 x2 = _mm_mul_ps(arg, arg);
	__m128 xunder = _mm_mul_ps(x2, xfm1);
	/* sqrt2 * over + 2 * sqrt2 * under */
	__m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
	                         _mm_add_ps(xover, xunder));
	xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
	xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
	return xavg;
}

MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask,
                                          const __m128 a,
                                          const __m128 b)
{
	return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}

#endif  /* __SSE2__ */

#endif /* __MATH_BASE_INLINE_C__ */