Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sv. Lockal <lockalsash@gmail.com> 2014-04-03 22:08:53 +0400
committer: Sv. Lockal <lockalsash@gmail.com> 2014-04-03 22:08:53 +0400
commit: e7c2578576380288befcd77e88edd8ae508ed01a (patch)
tree: 68b90ac3af5af9d0c38b7e8e5cd60f7edc2ae497 /intern/cycles/kernel/geom/geom_curve.h
parent: 5e5ec4c138de49005ea711d280e3e18794c9473d (diff)
Cycles: avoid 1.0f/(1.0f/x) divisions, which MSVC (and only MSVC) cannot optimize away.
This makes the BMW scene 6% faster in MSVC 12 builds. It also gives a minor speedup for SSE hair with all compilers.
Diffstat (limited to 'intern/cycles/kernel/geom/geom_curve.h')
-rw-r--r-- intern/cycles/kernel/geom/geom_curve.h 16
1 file changed, 7 insertions, 9 deletions
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b508f5045c1..e57bcd894a6 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -205,12 +205,12 @@ ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
#endif
#ifdef __KERNEL_SSE2__
-/* Pass P and idir by reference to aligned vector */
+/* Pass P and dir by reference to aligned vector */
ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
- const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+ const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
#else
ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
- float3 P, float3 idir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
+ float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
#endif
{
int segment = PRIMITIVE_UNPACK_SEGMENT(type);
@@ -222,7 +222,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
int prim = kernel_tex_fetch(__prim_index, curveAddr);
#ifdef __KERNEL_SSE2__
- __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
+ __m128 vdir = load_m128(dir);
__m128 vcurve_coef[4];
const float3 *curve_coef = (float3 *)vcurve_coef;
@@ -285,8 +285,6 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
float3 curve_coef[4];
/* curve Intersection check */
- float3 dir = 1.0f/idir;
-
/* obtain curve parameters */
{
/* ray transform created - this should be created at beginning of intersection loop */
@@ -597,7 +595,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
}
ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
- float3 P, float3 idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+ float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
{
/* define few macros to minimize code duplication for SSE */
#ifndef __KERNEL_SSE2__
@@ -647,9 +645,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
}
/* --- */
- float3 dir = 1.0f / idir;
float3 p21_diff = p2 - p1;
float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+ float3 dir = direction;
float sphere_b_tmp = dot3(dir, sphere_dif1);
float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
#else
@@ -680,9 +678,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
- const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
+ const __m128 dir = load_m128(direction);
const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
#endif