diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 22:08:53 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 22:08:53 +0400 |
commit | e7c2578576380288befcd77e88edd8ae508ed01a (patch) | |
tree | 68b90ac3af5af9d0c38b7e8e5cd60f7edc2ae497 /intern | |
parent | 5e5ec4c138de49005ea711d280e3e18794c9473d (diff) |
Cycles: avoid 1.0f/(1.0f/x) divisions, which MSVC (only) can't optimize away.
This makes the BMW scene 6% faster in MSVC 12 builds.
It also gives a minor speedup for SSE hair with all compilers.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_subsurface.h | 15 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_traversal.h | 23 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_curve.h | 16 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_motion_triangle.h | 10 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_object.h | 47 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_triangle.h | 6 |
6 files changed, 55 insertions, 62 deletions
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index ae4641d257a..6b71ffc24ba 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -50,7 +50,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio /* ray parameters in registers */ const float tmax = ray->t; float3 P = ray->P; - float3 idir = bvh_inverse_direction(ray->D); + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); int object = OBJECT_NONE; float isect_t = tmax; @@ -215,11 +216,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + triangle_intersect_subsurface(kg, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); break; } case PRIMITIVE_MOTION_TRIANGLE: { - motion_triangle_intersect_subsurface(kg, isect_array, P, idir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); break; } default: { @@ -235,9 +236,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio object = subsurface_object; #if FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax); #else - bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax); + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t, tmax); #endif #if defined(__KERNEL_SSE2__) @@ -271,9 +272,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio /* instance pop */ #if FEATURE(BVH_MOTION) - 
bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm, tmax); #else - bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t, tmax); #endif #if defined(__KERNEL_SSE2__) diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index 153efe3932c..566aa421474 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -55,7 +55,8 @@ ccl_device bool BVH_FUNCTION_NAME /* ray parameters in registers */ const float tmax = ray->t; float3 P = ray->P; - float3 idir = bvh_inverse_direction(ray->D); + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); int object = OBJECT_NONE; #if FEATURE(BVH_MOTION) @@ -253,11 +254,11 @@ ccl_device bool BVH_FUNCTION_NAME switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); + hit = triangle_intersect(kg, isect, P, dir, visibility, object, primAddr); break; } case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect, P, idir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); break; } #if FEATURE(BVH_HAIR) @@ -265,14 +266,14 @@ ccl_device bool BVH_FUNCTION_NAME case PRIMITIVE_MOTION_CURVE: { #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); else - hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type, lcg_state, difl, 
extmax); + hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); #else if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type); + hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type); else - hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, ray->time, type); + hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type); #endif break; @@ -306,9 +307,9 @@ ccl_device bool BVH_FUNCTION_NAME object = kernel_tex_fetch(__prim_object, -primAddr-1); #if FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax); #else - bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, tmax); #endif #if defined(__KERNEL_SSE2__) @@ -336,9 +337,9 @@ ccl_device bool BVH_FUNCTION_NAME /* instance pop */ #if FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm, tmax); #else - bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t, tmax); #endif #if defined(__KERNEL_SSE2__) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index b508f5045c1..e57bcd894a6 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -205,12 +205,12 @@ ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) #endif #ifdef __KERNEL_SSE2__ -/* Pass P and idir by reference to aligned vector */ +/* Pass P and dir by reference to aligned vector */ 
ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) + const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) #else ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) + float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) #endif { int segment = PRIMITIVE_UNPACK_SEGMENT(type); @@ -222,7 +222,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int prim = kernel_tex_fetch(__prim_index, curveAddr); #ifdef __KERNEL_SSE2__ - __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); + __m128 vdir = load_m128(dir); __m128 vcurve_coef[4]; const float3 *curve_coef = (float3 *)vcurve_coef; @@ -285,8 +285,6 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float3 curve_coef[4]; /* curve Intersection check */ - float3 dir = 1.0f/idir; - /* obtain curve parameters */ { /* ray transform created - this should be created at beginning of intersection loop */ @@ -597,7 +595,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect } ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) + float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) { /* define few macros to minimize code duplication for SSE */ #ifndef __KERNEL_SSE2__ 
@@ -647,9 +645,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec } /* --- */ - float3 dir = 1.0f / idir; float3 p21_diff = p2 - p1; float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; float sphere_b_tmp = dot3(dir, sphere_dif1); float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; #else @@ -680,9 +678,9 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); - const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]); const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); + const __m128 dir = load_m128(direction); const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); #endif diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index c5eb0974238..73338bb6b3b 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -313,7 +313,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD * time and do a ray intersection with the resulting triangle */ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, float time, uint visibility, int object, int triAddr) + float3 P, float3 dir, float time, uint visibility, int object, int triAddr) { /* primitive index for vertex location lookup */ int prim = kernel_tex_fetch(__prim_index, triAddr); @@ -324,10 +324,9 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection motion_triangle_vertices(kg, fobject, prim, time, verts); /* ray-triangle intersection, unoptimized */ - float3 D = 1.0f/idir; float t, u, 
v; - if(ray_triangle_intersect_uv(P, D, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { + if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_MOTION_TRIANGLE; @@ -347,7 +346,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection #ifdef __SUBSURFACE__ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, - float3 P, float3 idir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) + float3 P, float3 dir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) { /* primitive index for vertex location lookup */ int prim = kernel_tex_fetch(__prim_index, triAddr); @@ -358,10 +357,9 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I motion_triangle_vertices(kg, fobject, prim, time, verts); /* ray-triangle intersection, unoptimized */ - float3 D = 1.0f/idir; float t, u, v; - if(ray_triangle_intersect_uv(P, D, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) { + if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) { (*num_hits)++; int hit; diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 3be8a71ca83..71ad4a55088 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -361,33 +361,31 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) /* Object intersection in BVH */ -ccl_device_inline float3 bvh_inverse_direction(float3 dir) +ccl_device_inline float3 bvh_clamp_direction(float3 dir) { - /* avoid divide by zero (ooeps = exp2f(-80.0f)) */ - float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f; - float3 idir; - - idir.x = 1.0f/((fabsf(dir.x) > ooeps)? 
dir.x: copysignf(ooeps, dir.x)); - idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y)); - idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z)); + /* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */ + float ooeps = 8.271806E-25; + return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x), + (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y), + (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z)); +} - return idir; +ccl_device_inline float3 bvh_inverse_direction(float3 dir) +{ + return 1.0f / dir; } /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) +ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, ray->P); - float3 dir = transform_direction(&tfm, ray->D); - float len; - dir = normalize_len(dir, &len); - - *idir = bvh_inverse_direction(dir); + *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); + *idir = bvh_inverse_direction(*dir); if(*t != FLT_MAX) *t *= len; @@ -395,7 +393,7 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) +ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, const float tmax) { if(*t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); @@ -403,25 +401,23 @@ ccl_device_inline void 
bvh_instance_pop(KernelGlobals *kg, int object, const Ray } *P = ray->P; - *idir = bvh_inverse_direction(ray->D); + *dir = bvh_clamp_direction(ray->D); + *idir = bvh_inverse_direction(*dir); } #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) +ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax) { Transform itfm; *tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm); *P = transform_point(&itfm, ray->P); - float3 dir = transform_direction(&itfm, ray->D); - float len; - dir = normalize_len(dir, &len); - - *idir = bvh_inverse_direction(dir); + *dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len)); + *idir = bvh_inverse_direction(*dir); if(*t != FLT_MAX) *t *= len; @@ -429,13 +425,14 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c /* Transorm ray to exit motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) +ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm, const float tmax) { if(*t != FLT_MAX) *t *= len(transform_direction(tfm, 1.0f/(*idir))); *P = ray->P; - *idir = bvh_inverse_direction(ray->D); + *dir = bvh_clamp_direction(ray->D); + *idir = bvh_inverse_direction(*dir); } #endif diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 3fdf9e8a7cc..355e36fef0c 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ 
b/intern/cycles/kernel/geom/geom_triangle.h @@ -269,12 +269,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData * Based on Sven Woop's algorithm with precomputed triangle storage */ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int triAddr) + float3 P, float3 dir, uint visibility, int object, int triAddr) { /* compute and check intersection t-value */ float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); - float3 dir = 1.0f/idir; float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); @@ -322,12 +321,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect #ifdef __SUBSURFACE__ ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, - float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) + float3 P, float3 dir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) { /* compute and check intersection t-value */ float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); - float3 dir = 1.0f/idir; float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); |