1 files changed, 431 insertions, 429 deletions
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index fe4e02197a4..872271666aa 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -49,18 +49,18 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline float madd(const float a, const float b, const float c)
 {
-	/* NOTE: In the future we may want to explicitly ask for a fused
-	 * multiply-add in a specialized version for float.
-	 *
-	 * NOTE: GCC/ICC will turn this (for float) into a FMA unless
-	 * explicitly asked not to, clang seems to leave the code alone.
-	 */
-	return a * b + c;
+  /* NOTE: In the future we may want to explicitly ask for a fused
+   * multiply-add in a specialized version for float.
+   *
+   * NOTE: GCC/ICC will turn this (for float) into a FMA unless
+   * explicitly asked not to, clang seems to leave the code alone.
+   */
+  return a * b + c;
 }
 
 ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
 {
-	return a * b + c;
+  return a * b + c;
 }
 
 /*
@@ -82,116 +82,117 @@ ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
 /* Round to nearest integer, returning as an int. */
 ccl_device_inline int fast_rint(float x)
 {
-	/* used by sin/cos/tan range reduction. */
+  /* used by sin/cos/tan range reduction. */
 #ifdef __KERNEL_SSE4__
-	/* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
-	return float_to_int(rintf(x));
+  /* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
+  return float_to_int(rintf(x));
 #else
-	/* emulate rounding by adding/substracting 0.5. */
-	return float_to_int(x + copysignf(0.5f, x));
+  /* emulate rounding by adding/substracting 0.5. */
+  return float_to_int(x + copysignf(0.5f, x));
 #endif
 }
 
 ccl_device float fast_sinf(float x)
 {
-	/* Very accurate argument reduction from SLEEF,
-	 * starts failing around x=262000
-	 *
-	 * Results on: [-2pi,2pi].
-	 *
-	 * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
-	 * 1.19209e-07 max error
-	 */
-	int q = fast_rint(x * M_1_PI_F);
-	float qf = q;
-	x = madd(qf, -0.78515625f*4, x);
-	x = madd(qf, -0.00024187564849853515625f*4, x);
-	x = madd(qf, -3.7747668102383613586e-08f*4, x);
-	x = madd(qf, -1.2816720341285448015e-12f*4, x);
-	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals */
-	float s = x * x;
-	if((q & 1) != 0) x = -x;
-	/* This polynomial approximation has very low error on [-pi/2,+pi/2]
-	 * 1.19209e-07 max error in total over [-2pi,+2pi]. */
-	float u = 2.6083159809786593541503e-06f;
-	u = madd(u, s, -0.0001981069071916863322258f);
-	u = madd(u, s, +0.00833307858556509017944336f);
-	u = madd(u, s, -0.166666597127914428710938f);
-	u = madd(s, u * x, x);
-	/* For large x, the argument reduction can fail and the polynomial can be
-	 * evaluated with arguments outside the valid internal. Just clamp the bad
-	 * values away (setting to 0.0f means no branches need to be generated). */
-	if(fabsf(u) > 1.0f) {
-		u = 0.0f;
-	}
-	return u;
+  /* Very accurate argument reduction from SLEEF,
+   * starts failing around x=262000
+   *
+   * Results on: [-2pi,2pi].
+   *
+   * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
+   * 1.19209e-07 max error
+   */
+  int q = fast_rint(x * M_1_PI_F);
+  float qf = q;
+  x = madd(qf, -0.78515625f * 4, x);
+  x = madd(qf, -0.00024187564849853515625f * 4, x);
+  x = madd(qf, -3.7747668102383613586e-08f * 4, x);
+  x = madd(qf, -1.2816720341285448015e-12f * 4, x);
+  x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
+  float s = x * x;
+  if ((q & 1) != 0)
+    x = -x;
+  /* This polynomial approximation has very low error on [-pi/2,+pi/2]
+   * 1.19209e-07 max error in total over [-2pi,+2pi]. */
+  float u = 2.6083159809786593541503e-06f;
+  u = madd(u, s, -0.0001981069071916863322258f);
+  u = madd(u, s, +0.00833307858556509017944336f);
+  u = madd(u, s, -0.166666597127914428710938f);
+  u = madd(s, u * x, x);
+  /* For large x, the argument reduction can fail and the polynomial can be
+   * evaluated with arguments outside the valid internal. Just clamp the bad
+   * values away (setting to 0.0f means no branches need to be generated). */
+  if (fabsf(u) > 1.0f) {
+    u = 0.0f;
+  }
+  return u;
 }
 
 ccl_device float fast_cosf(float x)
 {
-	/* Same argument reduction as fast_sinf(). */
-	int q = fast_rint(x * M_1_PI_F);
-	float qf = q;
-	x = madd(qf, -0.78515625f*4, x);
-	x = madd(qf, -0.00024187564849853515625f*4, x);
-	x = madd(qf, -3.7747668102383613586e-08f*4, x);
-	x = madd(qf, -1.2816720341285448015e-12f*4, x);
-	x = M_PI_2_F - (M_PI_2_F - x);  /* Crush denormals. */
-	float s = x * x;
-	/* Polynomial from SLEEF's sincosf, max error is
-	 * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
-	float u = -2.71811842367242206819355e-07f;
-	u = madd(u, s, +2.47990446951007470488548e-05f);
-	u = madd(u, s, -0.00138888787478208541870117f);
-	u = madd(u, s, +0.0416666641831398010253906f);
-	u = madd(u, s, -0.5f);
-	u = madd(u, s, +1.0f);
-	if((q & 1) != 0) {
-		u = -u;
-	}
-	if(fabsf(u) > 1.0f) {
-		u = 0.0f;
-	}
-	return u;
+  /* Same argument reduction as fast_sinf(). */
+  int q = fast_rint(x * M_1_PI_F);
+  float qf = q;
+  x = madd(qf, -0.78515625f * 4, x);
+  x = madd(qf, -0.00024187564849853515625f * 4, x);
+  x = madd(qf, -3.7747668102383613586e-08f * 4, x);
+  x = madd(qf, -1.2816720341285448015e-12f * 4, x);
+  x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals. */
+  float s = x * x;
+  /* Polynomial from SLEEF's sincosf, max error is
+   * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
+  float u = -2.71811842367242206819355e-07f;
+  u = madd(u, s, +2.47990446951007470488548e-05f);
+  u = madd(u, s, -0.00138888787478208541870117f);
+  u = madd(u, s, +0.0416666641831398010253906f);
+  u = madd(u, s, -0.5f);
+  u = madd(u, s, +1.0f);
+  if ((q & 1) != 0) {
+    u = -u;
+  }
+  if (fabsf(u) > 1.0f) {
+    u = 0.0f;
+  }
+  return u;
 }
 
-ccl_device void fast_sincosf(float x, float* sine, float* cosine)
+ccl_device void fast_sincosf(float x, float *sine, float *cosine)
 {
-	/* Same argument reduction as fast_sin. */
-	int q = fast_rint(x * M_1_PI_F);
-	float qf = q;
-	x = madd(qf, -0.78515625f*4, x);
-	x = madd(qf, -0.00024187564849853515625f*4, x);
-	x = madd(qf, -3.7747668102383613586e-08f*4, x);
-	x = madd(qf, -1.2816720341285448015e-12f*4, x);
-	x = M_PI_2_F - (M_PI_2_F - x); // crush denormals
-	float s = x * x;
-	/* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
-	if((q & 1) != 0) {
-		x = -x;
-	}
-	float su = 2.6083159809786593541503e-06f;
-	su = madd(su, s, -0.0001981069071916863322258f);
-	su = madd(su, s, +0.00833307858556509017944336f);
-	su = madd(su, s, -0.166666597127914428710938f);
-	su = madd(s, su * x, x);
-	float cu = -2.71811842367242206819355e-07f;
-	cu = madd(cu, s, +2.47990446951007470488548e-05f);
-	cu = madd(cu, s, -0.00138888787478208541870117f);
-	cu = madd(cu, s, +0.0416666641831398010253906f);
-	cu = madd(cu, s, -0.5f);
-	cu = madd(cu, s, +1.0f);
-	if((q & 1) != 0) {
-		cu = -cu;
-	}
-	if(fabsf(su) > 1.0f) {
-		su = 0.0f;
-	}
-	if(fabsf(cu) > 1.0f) {
-		cu = 0.0f;
-	}
-	*sine   = su;
-	*cosine = cu;
+  /* Same argument reduction as fast_sin. */
+  int q = fast_rint(x * M_1_PI_F);
+  float qf = q;
+  x = madd(qf, -0.78515625f * 4, x);
+  x = madd(qf, -0.00024187564849853515625f * 4, x);
+  x = madd(qf, -3.7747668102383613586e-08f * 4, x);
+  x = madd(qf, -1.2816720341285448015e-12f * 4, x);
+  x = M_PI_2_F - (M_PI_2_F - x);  // crush denormals
+  float s = x * x;
+  /* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
+  if ((q & 1) != 0) {
+    x = -x;
+  }
+  float su = 2.6083159809786593541503e-06f;
+  su = madd(su, s, -0.0001981069071916863322258f);
+  su = madd(su, s, +0.00833307858556509017944336f);
+  su = madd(su, s, -0.166666597127914428710938f);
+  su = madd(s, su * x, x);
+  float cu = -2.71811842367242206819355e-07f;
+  cu = madd(cu, s, +2.47990446951007470488548e-05f);
+  cu = madd(cu, s, -0.00138888787478208541870117f);
+  cu = madd(cu, s, +0.0416666641831398010253906f);
+  cu = madd(cu, s, -0.5f);
+  cu = madd(cu, s, +1.0f);
+  if ((q & 1) != 0) {
+    cu = -cu;
+  }
+  if (fabsf(su) > 1.0f) {
+    su = 0.0f;
+  }
+  if (fabsf(cu) > 1.0f) {
+    cu = 0.0f;
+  }
+  *sine = su;
+  *cosine = cu;
 }
 
 /* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
@@ -200,33 +201,33 @@ ccl_device void fast_sincosf(float x, float* sine, float* cosine)
  */
 ccl_device float fast_tanf(float x)
 {
-	/* Derived from SLEEF implementation.
-	 *
-	 * Note that we cannot apply the "denormal crush" trick everywhere because
-	 * we sometimes need to take the reciprocal of the polynomial
-	 */
-	int q = fast_rint(x * 2.0f * M_1_PI_F);
-	float qf = q;
-	x = madd(qf, -0.78515625f*2, x);
-	x = madd(qf, -0.00024187564849853515625f*2, x);
-	x = madd(qf, -3.7747668102383613586e-08f*2, x);
-	x = madd(qf, -1.2816720341285448015e-12f*2, x);
-	if((q & 1) == 0) {
-		/* Crush denormals (only if we aren't inverting the result later). */
-		x = M_PI_4_F - (M_PI_4_F - x);
-	}
-	float s = x * x;
-	float u = 0.00927245803177356719970703f;
-	u = madd(u, s, 0.00331984995864331722259521f);
-	u = madd(u, s, 0.0242998078465461730957031f);
-	u = madd(u, s, 0.0534495301544666290283203f);
-	u = madd(u, s, 0.133383005857467651367188f);
-	u = madd(u, s, 0.333331853151321411132812f);
-	u = madd(s, u * x, x);
-	if((q & 1) != 0) {
-		u = -1.0f / u;
-	}
-	return u;
+  /* Derived from SLEEF implementation.
+   *
+   * Note that we cannot apply the "denormal crush" trick everywhere because
+   * we sometimes need to take the reciprocal of the polynomial
+   */
+  int q = fast_rint(x * 2.0f * M_1_PI_F);
+  float qf = q;
+  x = madd(qf, -0.78515625f * 2, x);
+  x = madd(qf, -0.00024187564849853515625f * 2, x);
+  x = madd(qf, -3.7747668102383613586e-08f * 2, x);
+  x = madd(qf, -1.2816720341285448015e-12f * 2, x);
+  if ((q & 1) == 0) {
+    /* Crush denormals (only if we aren't inverting the result later). */
+    x = M_PI_4_F - (M_PI_4_F - x);
+  }
+  float s = x * x;
+  float u = 0.00927245803177356719970703f;
+  u = madd(u, s, 0.00331984995864331722259521f);
+  u = madd(u, s, 0.0242998078465461730957031f);
+  u = madd(u, s, 0.0534495301544666290283203f);
+  u = madd(u, s, 0.133383005857467651367188f);
+  u = madd(u, s, 0.333331853151321411132812f);
+  u = madd(s, u * x, x);
+  if ((q & 1) != 0) {
+    u = -1.0f / u;
+  }
+  return u;
 }
 
 /* Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
@@ -235,122 +236,119 @@ ccl_device float fast_tanf(float x)
  */
 ccl_device float fast_sinpif(float x)
 {
-	/* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
-	const float z = x - ((x + 25165824.0f) - 25165824.0f);
-	const float y = z - z * fabsf(z);
-	const float Q = 3.10396624f;
-	const float P = 3.584135056f;  /* P = 16-4*Q */
-	return y * (Q + P * fabsf(y));
-
-	/* The original article used used inferior constants for Q and P and
-	 * so had max error 1.091e-3.
-	 *
-	 * The optimal value for Q was determined by exhaustive search, minimizing
-	 * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
-	 * over the interval [0,2] (which is where most of the invocations happen).
-	 *
-	 * The basic idea of this approximation starts with the coarse approximation:
-	 *      sin(pi*x) ~= f(x) =  4 * (x - x * abs(x))
-	 *
-	 * This approximation always _over_ estimates the target. On the other hand,
-	 * the curve:
-	 *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
-	 *
-	 * always lies _under_ the target. Thus we can simply numerically search for
-	 * the optimal constant to LERP these curves into a more precise
-	 * approximation.
-	 *
-	 * After folding the constants together and simplifying the resulting math,
-	 * we end up with the compact implementation above.
-	 *
-	 * NOTE: this function actually computes sin(x * pi) which avoids one or two
-	 * mults in many cases and guarantees exact values at integer periods.
-	 */
+  /* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
+  const float z = x - ((x + 25165824.0f) - 25165824.0f);
+  const float y = z - z * fabsf(z);
+  const float Q = 3.10396624f;
+  const float P = 3.584135056f; /* P = 16-4*Q */
+  return y * (Q + P * fabsf(y));
+
+  /* The original article used used inferior constants for Q and P and
+   * so had max error 1.091e-3.
+   *
+   * The optimal value for Q was determined by exhaustive search, minimizing
+   * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
+   * over the interval [0,2] (which is where most of the invocations happen).
+   *
+   * The basic idea of this approximation starts with the coarse approximation:
+   *      sin(pi*x) ~= f(x) =  4 * (x - x * abs(x))
+   *
+   * This approximation always _over_ estimates the target. On the other hand,
+   * the curve:
+   *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
+   *
+   * always lies _under_ the target. Thus we can simply numerically search for
+   * the optimal constant to LERP these curves into a more precise
+   * approximation.
+   *
+   * After folding the constants together and simplifying the resulting math,
+   * we end up with the compact implementation above.
+   *
+   * NOTE: this function actually computes sin(x * pi) which avoids one or two
+   * mults in many cases and guarantees exact values at integer periods.
+   */
 }
 
 /* Fast approximate cos(x*M_PI) with ~0.1% absolute error. */
 ccl_device_inline float fast_cospif(float x)
 {
-	return fast_sinpif(x+0.5f);
+  return fast_sinpif(x + 0.5f);
 }
 
 ccl_device float fast_acosf(float x)
 {
-	const float f = fabsf(x);
-	/* clamp and crush denormals. */
-	const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
-	/* Based on http://www.pouet.net/topic.php?which=9132&page=2
-	 * 85% accurate (ulp 0)
-	 * Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
-	 * Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
-	 */
-	const float a = sqrtf(1.0f - m) *
-		(1.5707963267f + m * (-0.213300989f + m *
-		                      (0.077980478f + m * -0.02164095f)));
-	return x < 0 ? M_PI_F - a : a;
+  const float f = fabsf(x);
+  /* clamp and crush denormals. */
+  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
+  /* Based on http://www.pouet.net/topic.php?which=9132&page=2
+   * 85% accurate (ulp 0)
+   * Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
+   * Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
+   */
+  const float a = sqrtf(1.0f - m) *
+                  (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
+  return x < 0 ? M_PI_F - a : a;
 }
 
 ccl_device float fast_asinf(float x)
 {
-	/* Based on acosf approximation above.
-	 * Max error is 4.51133e-05 (ulps are higher because we are consistently off
-	 * by a little amount).
-	 */
-	const float f = fabsf(x);
-	/* Clamp and crush denormals. */
-	const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
-	const float a = M_PI_2_F - sqrtf(1.0f - m) *
-		(1.5707963267f + m * (-0.213300989f + m *
-		                      (0.077980478f + m * -0.02164095f)));
-	return copysignf(a, x);
+  /* Based on acosf approximation above.
+   * Max error is 4.51133e-05 (ulps are higher because we are consistently off
+   * by a little amount).
+   */
+  const float f = fabsf(x);
+  /* Clamp and crush denormals. */
+  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
+  const float a = M_PI_2_F -
+                  sqrtf(1.0f - m) * (1.5707963267f +
+                                     m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
+  return copysignf(a, x);
 }
 
 ccl_device float fast_atanf(float x)
 {
-	const float a = fabsf(x);
-	const float k = a > 1.0f ? 1 / a : a;
-	const float s = 1.0f - (1.0f - k);  /* Crush denormals. */
-	const float t = s * s;
-	/* http://mathforum.org/library/drmath/view/62672.html
-	 * Examined 4278190080 values of atan: 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error      // (with  denormals)
-	 * Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
-	 */
-	float r = s * madd(0.43157974f, t, 1.0f) /
-	              madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
-	if(a > 1.0f) {
-		r = M_PI_2_F - r;
-	}
-	return copysignf(r, x);
+  const float a = fabsf(x);
+  const float k = a > 1.0f ? 1 / a : a;
+  const float s = 1.0f - (1.0f - k); /* Crush denormals. */
+  const float t = s * s;
+  /* http://mathforum.org/library/drmath/view/62672.html
+   * Examined 4278190080 values of atan: 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error      // (with  denormals)
+   * Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
+   */
+  float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
+  if (a > 1.0f) {
+    r = M_PI_2_F - r;
+  }
+  return copysignf(r, x);
 }
 
 ccl_device float fast_atan2f(float y, float x)
 {
-	/* Based on atan approximation above.
-	 *
-	 * The special cases around 0 and infinity were tested explicitly.
-	 *
-	 * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
-	 * of nan.
-	 */
-	const float a = fabsf(x);
-	const float b = fabsf(y);
-
-	const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
-	const float s = 1.0f - (1.0f - k);  /* Crush denormals */
-	const float t = s * s;
-
-	float r = s * madd(0.43157974f, t, 1.0f) /
-	              madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
-
-	if(b > a) {
-		/* Account for arg reduction. */
-		r = M_PI_2_F - r;
-	}
-	/* Test sign bit of x. */
-	if(__float_as_uint(x) & 0x80000000u) {
-		r = M_PI_F - r;
-	}
-	return copysignf(r, y);
+  /* Based on atan approximation above.
+   *
+   * The special cases around 0 and infinity were tested explicitly.
+   *
+   * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
+   * of nan.
+   */
+  const float a = fabsf(x);
+  const float b = fabsf(y);
+
+  const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
+  const float s = 1.0f - (1.0f - k); /* Crush denormals */
+  const float t = s * s;
+
+  float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
+
+  if (b > a) {
+    /* Account for arg reduction. */
+    r = M_PI_2_F - r;
+  }
+  /* Test sign bit of x. */
+  if (__float_as_uint(x) & 0x80000000u) {
+    r = M_PI_F - r;
+  }
+  return copysignf(r, y);
 }
 
 /* Based on:
@@ -359,204 +357,207 @@ ccl_device float fast_atan2f(float y, float x)
  */
 ccl_device float fast_log2f(float x)
 {
-	/* NOTE: clamp to avoid special cases and make result "safe" from large
-	 * negative values/nans. */
-	x = clamp(x, FLT_MIN, FLT_MAX);
-	unsigned bits = __float_as_uint(x);
-	int exponent = (int)(bits >> 23) - 127;
-	float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
-	/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
-	 * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
-	 * ulp histogram:
-	 *  0  = 97.46%
-	 *  1  =  2.29%
-	 *  2  =  0.11%
-	 */
-	float f2 = f * f;
-	float f4 = f2 * f2;
-	float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
-	float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
-	hi = madd(f, hi, -0.13753123777116f);
-	hi = madd(f, hi,  0.24187369696082f);
-	hi = madd(f, hi, -0.34730547155299f);
-	lo = madd(f, lo,  1.442689881667200f);
-	return ((f4 * hi) + (f * lo)) + exponent;
+  /* NOTE: clamp to avoid special cases and make result "safe" from large
+   * negative values/nans. */
+  x = clamp(x, FLT_MIN, FLT_MAX);
+  unsigned bits = __float_as_uint(x);
+  int exponent = (int)(bits >> 23) - 127;
+  float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
+  /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
+   * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
+   * ulp histogram:
+   *  0  = 97.46%
+   *  1  =  2.29%
+   *  2  =  0.11%
+   */
+  float f2 = f * f;
+  float f4 = f2 * f2;
+  float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
+  float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
+  hi = madd(f, hi, -0.13753123777116f);
+  hi = madd(f, hi, 0.24187369696082f);
+  hi = madd(f, hi, -0.34730547155299f);
+  lo = madd(f, lo, 1.442689881667200f);
+  return ((f4 * hi) + (f * lo)) + exponent;
 }
 
 ccl_device_inline float fast_logf(float x)
 {
-	/* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
-	 * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
-	 */
-	return fast_log2f(x) * M_LN2_F;
+  /* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
+   * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
+   */
+  return fast_log2f(x) * M_LN2_F;
 }
 
 ccl_device_inline float fast_log10(float x)
 {
-	/* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
-	 * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
-	 */
-	return fast_log2f(x) * M_LN2_F / M_LN10_F;
+  /* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
+   * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
+   */
+  return fast_log2f(x) * M_LN2_F / M_LN10_F;
 }
 
 ccl_device float fast_logb(float x)
 {
-	/* Don't bother with denormals. */
-	x = fabsf(x);
-	x = clamp(x, FLT_MIN, FLT_MAX);
-	unsigned bits = __float_as_uint(x);
-	return (int)(bits >> 23) - 127;
+  /* Don't bother with denormals. */
+  x = fabsf(x);
+  x = clamp(x, FLT_MIN, FLT_MAX);
+  unsigned bits = __float_as_uint(x);
+  return (int)(bits >> 23) - 127;
 }
 
 ccl_device float fast_exp2f(float x)
 {
-	/* Clamp to safe range for final addition. */
-	x = clamp(x, -126.0f, 126.0f);
-	/* Range reduction. */
-	int m = (int)x; x -= m;
-	x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
-	/* 5th degree polynomial generated with sollya
-	 * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
-	 * 232 max ulp.
-	 *
-	 * ulp histogram:
-	 *  0  = 87.81%
-	 *  1  =  4.18%
-	 */
-	float r = 1.33336498402e-3f;
-	r = madd(x, r, 9.810352697968e-3f);
-	r = madd(x, r, 5.551834031939e-2f);
-	r = madd(x, r, 0.2401793301105f);
-	r = madd(x, r, 0.693144857883f);
-	r = madd(x, r, 1.0f);
-	/* Multiply by 2 ^ m by adding in the exponent. */
-	/* NOTE: left-shift of negative number is undefined behavior. */
-	return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
+  /* Clamp to safe range for final addition. */
+  x = clamp(x, -126.0f, 126.0f);
+  /* Range reduction. */
+  int m = (int)x;
+  x -= m;
+  x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
+  /* 5th degree polynomial generated with sollya
+   * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
+   * 232 max ulp.
+   *
+   * ulp histogram:
+   *  0  = 87.81%
+   *  1  =  4.18%
+   */
+  float r = 1.33336498402e-3f;
+  r = madd(x, r, 9.810352697968e-3f);
+  r = madd(x, r, 5.551834031939e-2f);
+  r = madd(x, r, 0.2401793301105f);
+  r = madd(x, r, 0.693144857883f);
+  r = madd(x, r, 1.0f);
+  /* Multiply by 2 ^ m by adding in the exponent. */
+  /* NOTE: left-shift of negative number is undefined behavior. */
+  return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
 }
 
 ccl_device_inline float fast_expf(float x)
 {
-	/* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
-	 * 2.6666452 avg ulp diff, 230 max ulp.
-	 */
-	return fast_exp2f(x / M_LN2_F);
+  /* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
+   * 2.6666452 avg ulp diff, 230 max ulp.
+   */
+  return fast_exp2f(x / M_LN2_F);
 }
 
 #ifndef __KERNEL_GPU__
 ccl_device float4 fast_exp2f4(float4 x)
 {
-	const float4 one = make_float4(1.0f);
-	const float4 limit = make_float4(126.0f);
-	x = clamp(x, -limit, limit);
-	int4 m = make_int4(x);
-	x = one - (one - (x - make_float4(m)));
-	float4 r = make_float4(1.33336498402e-3f);
-	r = madd4(x, r, make_float4(9.810352697968e-3f));
-	r = madd4(x, r, make_float4(5.551834031939e-2f));
-	r = madd4(x, r, make_float4(0.2401793301105f));
-	r = madd4(x, r, make_float4(0.693144857883f));
-	r = madd4(x, r, make_float4(1.0f));
-	return __int4_as_float4(__float4_as_int4(r) + (m << 23));
+  const float4 one = make_float4(1.0f);
+  const float4 limit = make_float4(126.0f);
+  x = clamp(x, -limit, limit);
+  int4 m = make_int4(x);
+  x = one - (one - (x - make_float4(m)));
+  float4 r = make_float4(1.33336498402e-3f);
+  r = madd4(x, r, make_float4(9.810352697968e-3f));
+  r = madd4(x, r, make_float4(5.551834031939e-2f));
+  r = madd4(x, r, make_float4(0.2401793301105f));
+  r = madd4(x, r, make_float4(0.693144857883f));
+  r = madd4(x, r, make_float4(1.0f));
+  return __int4_as_float4(__float4_as_int4(r) + (m << 23));
 }
 
 ccl_device_inline float4 fast_expf4(float4 x)
 {
-	return fast_exp2f4(x / M_LN2_F);
+  return fast_exp2f4(x / M_LN2_F);
 }
 #endif
 
 ccl_device_inline float fast_exp10(float x)
 {
-	/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
-	 * 2.71732409 avg ulp diff, 232 max ulp.
-	 */
-	return fast_exp2f(x * M_LN10_F / M_LN2_F);
+  /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
+   * 2.71732409 avg ulp diff, 232 max ulp.
+   */
+  return fast_exp2f(x * M_LN10_F / M_LN2_F);
 }
 
 ccl_device_inline float fast_expm1f(float x)
 {
-	if(fabsf(x) < 1e-5f) {
-		x = 1.0f - (1.0f - x);  /* Crush denormals. */
-		return madd(0.5f, x * x, x);
-	}
-	else {
-		return fast_expf(x) - 1.0f;
-	}
+  if (fabsf(x) < 1e-5f) {
+    x = 1.0f - (1.0f - x); /* Crush denormals. */
+    return madd(0.5f, x * x, x);
+  }
+  else {
+    return fast_expf(x) - 1.0f;
+  }
 }
 
 ccl_device float fast_sinhf(float x)
 {
-	float a = fabsf(x);
-	if(a > 1.0f) {
-		/* Examined 53389559 values of sinh on [1,87.3300018]:
-		 * 33.6886442 avg ulp diff, 178 max ulp. */
-		float e = fast_expf(a);
-		return copysignf(0.5f * e - 0.5f / e, x);
-	}
-	else {
-		a = 1.0f - (1.0f - a);  /* Crush denorms. */
-		float a2 = a * a;
-		/* Degree 7 polynomial generated with sollya. */
-		/* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
-		float r = 2.03945513931e-4f;
-		r = madd(r, a2, 8.32990277558e-3f);
-		r = madd(r, a2, 0.1666673421859f);
-		r = madd(r * a, a2, a);
-		return copysignf(r, x);
-	}
+  float a = fabsf(x);
+  if (a > 1.0f) {
+    /* Examined 53389559 values of sinh on [1,87.3300018]:
+     * 33.6886442 avg ulp diff, 178 max ulp. */
+    float e = fast_expf(a);
+    return copysignf(0.5f * e - 0.5f / e, x);
+  }
+  else {
+    a = 1.0f - (1.0f - a); /* Crush denorms. */
+    float a2 = a * a;
+    /* Degree 7 polynomial generated with sollya. */
+    /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
+    float r = 2.03945513931e-4f;
+    r = madd(r, a2, 8.32990277558e-3f);
+    r = madd(r, a2, 0.1666673421859f);
+    r = madd(r * a, a2, a);
+    return copysignf(r, x);
+  }
 }
 
 ccl_device_inline float fast_coshf(float x)
 {
-	/* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
-	 * 1.78256726 avg ulp diff, 178 max ulp.
-	 */
-	float e = fast_expf(fabsf(x));
-	return 0.5f * e + 0.5f / e;
+  /* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
+   * 1.78256726 avg ulp diff, 178 max ulp.
+   */
+  float e = fast_expf(fabsf(x));
+  return 0.5f * e + 0.5f / e;
 }
 
 ccl_device_inline float fast_tanhf(float x)
 {
-	/* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
-	 * 3.12924e-06 max error.
-	 */
-	/* NOTE: ulp error is high because of sub-optimal handling around the origin. */
-	float e = fast_expf(2.0f * fabsf(x));
-	return copysignf(1.0f - 2.0f / (1.0f + e), x);
+  /* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
+   * 3.12924e-06 max error.
+   */
+  /* NOTE: ulp error is high because of sub-optimal handling around the origin. */
+  float e = fast_expf(2.0f * fabsf(x));
+  return copysignf(1.0f - 2.0f / (1.0f + e), x);
 }
 
 ccl_device float fast_safe_powf(float x, float y)
 {
-	if(y == 0) return 1.0f;  /* x^1=1 */
-	if(x == 0) return 0.0f;  /* 0^y=0 */
-	float sign = 1.0f;
-	if(x < 0.0f) {
-		/* if x is negative, only deal with integer powers
-		 * powf returns NaN for non-integers, we will return 0 instead.
-		 */
-		int ybits = __float_as_int(y) & 0x7fffffff;
-		if(ybits >= 0x4b800000) {
-			// always even int, keep positive
-		}
-		else if(ybits >= 0x3f800000) {
-			/* Bigger than 1, check. */
-			int k = (ybits >> 23) - 127;  /* Get exponent. */
-			int j =  ybits >> (23 - k);   /* Shift out possible fractional bits. */
-			if((j << (23 - k)) == ybits) {  /* rebuild number and check for a match. */
-				/* +1 for even, -1 for odd. */
-				sign = __int_as_float(0x3f800000 | (j << 31));
-			}
-			else {
-				/* Not an integer. */
-				return 0.0f;
-			}
-		}
-		else {
-			/* Not an integer. */
-			return 0.0f;
-		}
-	}
-	return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
+  if (y == 0)
+    return 1.0f; /* x^1=1 */
+  if (x == 0)
+    return 0.0f; /* 0^y=0 */
+  float sign = 1.0f;
+  if (x < 0.0f) {
+    /* if x is negative, only deal with integer powers
+     * powf returns NaN for non-integers, we will return 0 instead.
+     */
+    int ybits = __float_as_int(y) & 0x7fffffff;
+    if (ybits >= 0x4b800000) {
+      // always even int, keep positive
+    }
+    else if (ybits >= 0x3f800000) {
+      /* Bigger than 1, check. */
+      int k = (ybits >> 23) - 127;    /* Get exponent. */
+      int j = ybits >> (23 - k);      /* Shift out possible fractional bits. */
+      if ((j << (23 - k)) == ybits) { /* rebuild number and check for a match. */
+        /* +1 for even, -1 for odd. */
+        sign = __int_as_float(0x3f800000 | (j << 31));
+      }
+      else {
+        /* Not an integer. */
+        return 0.0f;
+      }
+    }
+    else {
+      /* Not an integer. */
+      return 0.0f;
+    }
+  }
+  return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
 }
 
 /* TODO(sergey): Check speed  with our erf functions implementation from
@@ -565,74 +566,75 @@ ccl_device float fast_safe_powf(float x, float y)
 
 ccl_device_inline float fast_erff(float x)
 {
-	/* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
-	/* Abramowitz and Stegun, 7.1.28. */
-	const float a1 = 0.0705230784f;
-	const float a2 = 0.0422820123f;
-	const float a3 = 0.0092705272f;
-	const float a4 = 0.0001520143f;
-	const float a5 = 0.0002765672f;
-	const float a6 = 0.0000430638f;
-	const float a = fabsf(x);
-	if(a >= 12.3f) {
-		return copysignf(1.0f, x);
-	}
-	const float b = 1.0f - (1.0f - a);  /* Crush denormals. */
-	const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
-	const float s = r * r;  /* ^2 */
-	const float t = s * s;  /* ^4 */
-	const float u = t * t;  /* ^8 */
-	const float v = u * u;  /* ^16 */
-	return copysignf(1.0f - 1.0f / v, x);
+  /* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
+  /* Abramowitz and Stegun, 7.1.28. */
+  const float a1 = 0.0705230784f;
+  const float a2 = 0.0422820123f;
+  const float a3 = 0.0092705272f;
+  const float a4 = 0.0001520143f;
+  const float a5 = 0.0002765672f;
+  const float a6 = 0.0000430638f;
+  const float a = fabsf(x);
+  if (a >= 12.3f) {
+    return copysignf(1.0f, x);
+  }
+  const float b = 1.0f - (1.0f - a); /* Crush denormals. */
+  const float r = madd(
+      madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
+  const float s = r * r; /* ^2 */
+  const float t = s * s; /* ^4 */
+  const float u = t * t; /* ^8 */
+  const float v = u * u; /* ^16 */
+  return copysignf(1.0f - 1.0f / v, x);
 }
 
 ccl_device_inline float fast_erfcf(float x)
 {
-	/* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
-	 *
-	 * ulp histogram:
-	 *
-	 *  0  = 80.30%
-	 */
-	return 1.0f - fast_erff(x);
+  /* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
+   *
+   * ulp histogram:
+   *
+   *  0  = 80.30%
+   */
+  return 1.0f - fast_erff(x);
 }
 
 ccl_device_inline float fast_ierff(float x)
 {
-	/* From: Approximating the erfinv function by Mike Giles. */
-	/* To avoid trouble at the limit, clamp input to 1-eps. */
-	float a = fabsf(x);
-	if(a > 0.99999994f) {
-		a = 0.99999994f;
-	}
-	float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
-	if(w < 5.0f) {
-		w = w - 2.5f;
-		p =  2.81022636e-08f;
-		p = madd(p, w,  3.43273939e-07f);
-		p = madd(p, w, -3.5233877e-06f);
-		p = madd(p, w, -4.39150654e-06f);
-		p = madd(p, w,  0.00021858087f);
-		p = madd(p, w, -0.00125372503f);
-		p = madd(p, w, -0.00417768164f);
-		p = madd(p, w,  0.246640727f);
-		p = madd(p, w,  1.50140941f);
-	}
-	else {
-		w = sqrtf(w) - 3.0f;
-		p = -0.000200214257f;
-		p = madd(p, w,  0.000100950558f);
-		p = madd(p, w,  0.00134934322f);
-		p = madd(p, w, -0.00367342844f);
-		p = madd(p, w,  0.00573950773f);
-		p = madd(p, w, -0.0076224613f);
-		p = madd(p, w,  0.00943887047f);
-		p = madd(p, w,  1.00167406f);
-		p = madd(p, w,  2.83297682f);
-	}
-	return p * x;
+  /* From: Approximating the erfinv function by Mike Giles. */
+  /* To avoid trouble at the limit, clamp input to 1-eps. */
+  float a = fabsf(x);
+  if (a > 0.99999994f) {
+    a = 0.99999994f;
+  }
+  float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
+  if (w < 5.0f) {
+    w = w - 2.5f;
+    p = 2.81022636e-08f;
+    p = madd(p, w, 3.43273939e-07f);
+    p = madd(p, w, -3.5233877e-06f);
+    p = madd(p, w, -4.39150654e-06f);
+    p = madd(p, w, 0.00021858087f);
+    p = madd(p, w, -0.00125372503f);
+    p = madd(p, w, -0.00417768164f);
+    p = madd(p, w, 0.246640727f);
+    p = madd(p, w, 1.50140941f);
+  }
+  else {
+    w = sqrtf(w) - 3.0f;
+    p = -0.000200214257f;
+    p = madd(p, w, 0.000100950558f);
+    p = madd(p, w, 0.00134934322f);
+    p = madd(p, w, -0.00367342844f);
+    p = madd(p, w, 0.00573950773f);
+    p = madd(p, w, -0.0076224613f);
+    p = madd(p, w, 0.00943887047f);
+    p = madd(p, w, 1.00167406f);
+    p = madd(p, w, 2.83297682f);
+  }
+  return p * x;
 }
 
 CCL_NAMESPACE_END
 
-#endif  /* __UTIL_FAST_MATH__ */
+#endif /* __UTIL_FAST_MATH__ */