From a9bb8d8a73cad2edd96297c868ae3a3ab4b04e27 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Sun, 5 Apr 2015 22:13:10 +0500 Subject: Cycles: de-duplicate fast/approximate erf function calculation Our own implementation is in fact the same performance as in fast_math from OpenShadingLanguage, but implementation from fast_math is using explicit madd function, which increases chance of compiler deciding to use intrinsics. --- intern/cycles/kernel/closure/bsdf_microfacet.h | 81 ++------------------------ intern/cycles/util/util_math_fast.h | 4 +- 2 files changed, 8 insertions(+), 77 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index b225e674702..32cf36abd27 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -35,75 +35,6 @@ CCL_NAMESPACE_BEGIN -/* Approximate erf and erfinv implementations. - * Implementation comes straight from Wikipedia: - * - * http://en.wikipedia.org/wiki/Error_function - * - * Some constants are baked into the code. - */ - -ccl_device_inline float approx_erff_do(float x) -{ - /* Such a clamp doesn't give much distortion to the output value - * and gives quite a few of the speedup. - */ - if(x > 3.0f) { - return 1.0f; - } - float t = 1.0f / (1.0f + 0.47047f*x); - return (1.0f - - t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); -} - -ccl_device_inline float approx_erff(float x) -{ - if(x >= 0.0f) { - return approx_erff_do(x); - } - else { - return -approx_erff_do(-x); - } -} - -ccl_device_inline float approx_erfinvf_do(float x) -{ - if(x <= 0.7f) { - const float x2 = x * x; - const float a1 = 0.886226899f; - const float a2 = -1.645349621f; - const float a3 = 0.914624893f; - const float a4 = -0.140543331f; - const float b1 = -2.118377725f; - const float b2 = 1.442710462f; - const float b3 = -0.329097515f; - const float b4 = 0.012229801f; - return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) / - ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f); - } - else { - const float c1 = -1.970840454f; - const float c2 = -1.624906493f; - const float c3 = 3.429567803f; - const float c4 = 1.641345311f; - const float d1 = 3.543889200f; - const float d2 = 1.637067800f; - const float z = sqrtf(-logf((1.0f - x) * 0.5f)); - return (((c4 * z + c3) * z + c2) * z + c1) / - ((d2 * z + d1) * z + 1.0f); - } -} - -ccl_device_inline float approx_erfinvf(float x) -{ - if(x >= 0.0f) { - return approx_erfinvf_do(x); - } - else { - return -approx_erfinvf_do(-x); - } -} - /* Beckmann and GGX microfacet importance sampling. */ ccl_device_inline void microfacet_beckmann_sample_slopes( @@ -126,7 +57,7 @@ ccl_device_inline void microfacet_beckmann_sample_slopes( const float tan_theta_i = sin_theta_i/cos_theta_i; const float inv_a = tan_theta_i; const float cot_theta_i = 1.0f/tan_theta_i; - const float erf_a = approx_erff(cot_theta_i); + const float erf_a = fast_erff(cot_theta_i); const float exp_a2 = expf(-cot_theta_i*cot_theta_i); const float SQRT_PI_INV = 0.56418958354f; const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a); @@ -156,31 +87,31 @@ ccl_device_inline void microfacet_beckmann_sample_slopes( float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f; /* Perform newton step to refine toward the true root. */ - float inv_erf = approx_erfinvf(b); + float inv_erf = fast_ierff(b); float value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact; /* Check if we are close enough already, * this also avoids NaNs as we get close to the root. */ if(fabsf(value) > 1e-6f) { b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */ - inv_erf = approx_erfinvf(b); + inv_erf = fast_ierff(b); value = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact; b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */ /* Compute the slope from the refined value. */ - *slope_x = approx_erfinvf(b); + *slope_x = fast_ierff(b); } else { /* We are close enough already. */ *slope_x = inv_erf; } - *slope_y = approx_erfinvf(2.0f*randv - 1.0f); + *slope_y = fast_ierff(2.0f*randv - 1.0f); #else /* Use precomputed table on CPU, it gives better perfomance. */ int beckmann_table_offset = kernel_data.tables.beckmann_offset; *slope_x = lookup_table_read_2D(kg, randu, cos_theta_i, beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE); - *slope_y = approx_erfinvf(2.0f*randv - 1.0f); + *slope_y = fast_ierff(2.0f*randv - 1.0f); #endif } diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h index 4ad81e9c015..1acceee22a5 100644 --- a/intern/cycles/util/util_math_fast.h +++ b/intern/cycles/util/util_math_fast.h @@ -539,7 +539,7 @@ ccl_device float fast_safe_powf(float x, float y) * bsdf_microfaset.h. */ -ccl_device float fast_erff(float x) +ccl_device_inline float fast_erff(float x) { /* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */ /* Abramowitz and Stegun, 7.1.28. */ @@ -570,7 +570,7 @@ ccl_device_inline float fast_erfcf(float x) return 1.0f - fast_erff(x); } -ccl_device float fast_ierff(float x) +ccl_device_inline float fast_ierff(float x) { /* From: Approximating the erfinv function by Mike Giles. */ /* To avoid trouble at the limit, clamp input to 1-eps. */ -- cgit v1.2.3