Cycles: avoid 1.0f/(1.0f/x) divisions, which only MSVC fails to optimize away.

This makes the BMW scene 6% faster in MSVC 12 builds. It also gives a minor speedup for SSE hair with all compilers.
author: Sv. Lockal <lockalsash@gmail.com> 2014-04-03 22:08:53 +0400
committer: Sv. Lockal <lockalsash@gmail.com> 2014-04-03 22:08:53 +0400
commit: e7c2578576380288befcd77e88edd8ae508ed01a (patch)
tree: 68b90ac3af5af9d0c38b7e8e5cd60f7edc2ae497 /intern/cycles/kernel/geom/geom_curve.h
parent: 5e5ec4c138de49005ea711d280e3e18794c9473d (diff)
1 files changed, 7 insertions, 9 deletions
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b508f5045c1..e57bcd894a6 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -205,12 +205,12 @@ ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
 #endif
 
 #ifdef __KERNEL_SSE2__
-/* Pass P and idir by reference to aligned vector */
+/* Pass P and dir by reference to aligned vector */
 ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
 ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
+	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
@@ -222,7 +222,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 	int prim = kernel_tex_fetch(__prim_index, curveAddr);
 
 #ifdef __KERNEL_SSE2__
-	__m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
+	__m128 vdir = load_m128(dir);
 	__m128 vcurve_coef[4];
 	const float3 *curve_coef = (float3 *)vcurve_coef;
 	
@@ -285,8 +285,6 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 	float3 curve_coef[4];
 
 	/* curve Intersection check */
-	float3 dir = 1.0f/idir;
-
 	/* obtain curve parameters */
 	{
 		/* ray transform created - this should be created at beginning of intersection loop */
@@ -597,7 +595,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 }
 
 ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
 #ifndef __KERNEL_SSE2__
@@ -647,9 +645,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	}
 	/* --- */
 
-	float3 dir = 1.0f / idir;
 	float3 p21_diff = p2 - p1;
 	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float3 dir = direction;
 	float sphere_b_tmp = dot3(dir, sphere_dif1);
 	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
 #else
@@ -680,9 +678,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
 	float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
 
-	const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
 	const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
 	const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
+	const __m128 dir = load_m128(direction);
 	const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
 	const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
 #endif
author	Sv. Lockal <lockalsash@gmail.com>	2014-04-03 22:08:53 +0400
committer	Sv. Lockal <lockalsash@gmail.com>	2014-04-03 22:08:53 +0400
commit	e7c2578576380288befcd77e88edd8ae508ed01a (patch)
tree	68b90ac3af5af9d0c38b7e8e5cd60f7edc2ae497 /intern/cycles/kernel/geom/geom_curve.h
parent	5e5ec4c138de49005ea711d280e3e18794c9473d (diff)