diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 22:08:53 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 22:08:53 +0400 |
commit | e7c2578576380288befcd77e88edd8ae508ed01a (patch) | |
tree | 68b90ac3af5af9d0c38b7e8e5cd60f7edc2ae497 /intern/cycles/kernel/geom/geom_curve.h | |
parent | 5e5ec4c138de49005ea711d280e3e18794c9473d (diff) |
Cycles: avoid 1.0f/(1.0f/x) divisions, which MSVC (and only MSVC) fails to optimize away.
This makes the BMW scene render 6% faster in MSVC 12 builds.
It also gives a minor speedup for the SSE hair code with all compilers.
Diffstat (limited to 'intern/cycles/kernel/geom/geom_curve.h')
-rw-r--r-- | intern/cycles/kernel/geom/geom_curve.h | 16 |
1 file changed, 7 insertions, 9 deletions
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index b508f5045c1..e57bcd894a6 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -205,12 +205,12 @@ ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) #endif #ifdef __KERNEL_SSE2__ -/* Pass P and idir by reference to aligned vector */ +/* Pass P and dir by reference to aligned vector */ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) + const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) #else ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) + float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) #endif { int segment = PRIMITIVE_UNPACK_SEGMENT(type); @@ -222,7 +222,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int prim = kernel_tex_fetch(__prim_index, curveAddr); #ifdef __KERNEL_SSE2__ - __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); + __m128 vdir = load_m128(dir); __m128 vcurve_coef[4]; const float3 *curve_coef = (float3 *)vcurve_coef; @@ -285,8 +285,6 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float3 curve_coef[4]; /* curve Intersection check */ - float3 dir = 1.0f/idir; - /* obtain curve parameters */ { /* ray transform created - this should be created at beginning of intersection loop */ @@ -597,7 +595,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals 
*kg, Intersect } ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) + float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) { /* define few macros to minimize code duplication for SSE */ #ifndef __KERNEL_SSE2__ @@ -647,9 +645,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec } /* --- */ - float3 dir = 1.0f / idir; float3 p21_diff = p2 - p1; float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; float sphere_b_tmp = dot3(dir, sphere_dif1); float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; #else @@ -680,9 +678,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); - const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]); const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); + const __m128 dir = load_m128(direction); const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); #endif |