From 31fbf2b74adf6f1b810915715614667aaf2e6f94 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Fri, 2 Dec 2016 12:15:24 +0100 Subject: Cycles: Implement AVX2 path for curve intersection functions Gives little performance improvement on Linux and gives up to 2% speedup on koro.blend on Windows. Inspired by Maxym Dmytrychenko, thanks! --- intern/cycles/kernel/geom/geom_curve.h | 66 +++++++++++++++++++++++++-- intern/cycles/kernel/geom/geom_motion_curve.h | 66 ++++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 5 deletions(-) (limited to 'intern/cycles/kernel/geom') diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 84aaaab7453..636dbcc71e0 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -255,6 +255,17 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte int ka = max(k0 - 1, v00.x); int kb = min(k1 + 1, v00.x + v00.y - 1); +#ifdef __KERNEL_AVX2__ + avxf P_curve_0_1, P_curve_2_3; + if(type & PRIMITIVE_CURVE) { + P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); + P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; + motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); + } +#else /* __KERNEL_AVX2__ */ ssef P_curve[4]; if(type & PRIMITIVE_CURVE) { @@ -267,6 +278,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); } +#endif /* __KERNEL_AVX2__ */ ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; @@ -278,6 +290,33 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); +#ifdef __KERNEL_AVX2__ + const avxf vPP = _mm256_broadcast_ps(&P.m128); + const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); + const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); + const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); + + const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), + htfm00, + madd(shuffle<1>(P_curve_0_1 - vPP), + htfm11, + shuffle<2>(P_curve_0_1 - vPP) * htfm22)); + const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), + htfm00, + madd(shuffle<1>(P_curve_2_3 - vPP), + htfm11, + shuffle<2>(P_curve_2_3 - vPP)*htfm22)); + + const ssef p0 = _mm256_castps256_ps128(p01); + const ssef p1 = _mm256_extractf128_ps(p01, 1); + const ssef p2 = _mm256_castps256_ps128(p23); + const ssef p3 = _mm256_extractf128_ps(p23, 1); + + const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); + r_st = ((float4 &)P_curve_1).w; + const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); + r_en = ((float4 &)P_curve_2).w; +#else /* __KERNEL_AVX2__ */ ssef htfm[] = { htfm0, htfm1, htfm2 }; ssef vP = load4f(P); ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); @@ -285,6 +324,10 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); + r_st = ((float4 &)P_curve[1]).w; + r_en = ((float4 &)P_curve[2]).w; +#endif /* __KERNEL_AVX2__ */ + float fc = 0.71f; ssef vfc = ssef(fc); ssef vfcxp3 = vfc * p3; @@ -294,8 +337,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); - r_st = ((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; } #else float3 curve_coef[4]; @@ -383,8 +424,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte /* begin loop */ while(!(tree >> (depth))) { - float i_st = tree * resol; - float i_en = i_st + (level * resol); + const float i_st = tree * resol; + const float i_en = i_st + (level * resol); + #ifdef __KERNEL_SSE2__ ssef vi_st = ssef(i_st), vi_en = ssef(i_en); ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); @@ -458,13 +500,23 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte if(flags & CURVE_KN_RIBBONS) { float3 tg = (p_en - p_st); +#ifdef __KERNEL_SSE__ + const float3 tg_sq = tg * tg; + float w = tg_sq.x + tg_sq.y; +#else float w = tg.x * tg.x + tg.y * tg.y; +#endif if(w == 0) { tree++; level = tree & -tree; continue; } +#ifdef __KERNEL_SSE__ + const float3 p_sttg = p_st * tg; + w = -(p_sttg.x + p_sttg.y) / w; +#else w = -(p_st.x * tg.x + p_st.y * tg.y) / w; +#endif w = saturate(w); /* compute u on the curve segment */ @@ -496,7 +548,13 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte if(difl != 0.0f) { mw_extension = min(difl * fabsf(bmaxz), extmax); r_ext = mw_extension + r_curr; +#ifdef __KERNEL_SSE__ + const float3 p_curr_sq = p_curr * p_curr; + const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); + float d = dxxx.x; +#else float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); +#endif float d0 = d - r_curr; float d1 = d + r_curr; float inv_mw_extension = 1.0f/mw_extension; diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index 6de5aa7ea99..80b33fad68b 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -118,7 +118,12 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, in } /* return 2 curve key locations */ -ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4]) +ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, + int object, + int prim, + float time, + int k0, int k1, int k2, int k3, + float4 keys[4]) { /* get motion info */ int numsteps, numkeys; @@ -147,6 +152,65 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, keys[3] = (1.0f - t)*keys[3] + t*next_keys[3]; } +#ifdef __KERNEL_AVX2__ +/* Similar to above, but returns keys as pair of two AVX registers with each + * holding two float4. + */ +ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg, + int object, + int prim, + float time, + int k0, int k1, + int k2, int k3, + avxf *out_keys_0_1, + avxf *out_keys_2_3) +{ + /* Get motion info. */ + int numsteps, numkeys; + object_motion_info(kg, object, &numsteps, NULL, &numkeys); + + /* Figure out which steps we need to fetch and their interpolation factor. */ + int maxstep = numsteps * 2; + int step = min((int)(time*maxstep), maxstep - 1); + float t = time*maxstep - step; + + /* Find attribute. */ + AttributeElement elem; + int offset = find_attribute_curve_motion(kg, + object, + ATTR_STD_MOTION_VERTEX_POSITION, + &elem); + kernel_assert(offset != ATTR_STD_NOT_FOUND); + + /* Fetch key coordinates. */ + float4 next_keys[4]; + float4 keys[4]; + motion_cardinal_curve_keys_for_step(kg, + offset, + numkeys, + numsteps, + step, + k0, k1, k2, k3, + keys); + motion_cardinal_curve_keys_for_step(kg, + offset, + numkeys, + numsteps, + step + 1, + k0, k1, k2, k3, + next_keys); + + const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128); + const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128); + const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128); + const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128); + + /* Interpolate between steps. */ + *out_keys_0_1 = (1.0f - t) * keys_0_1 + t*next_keys_0_1; + *out_keys_2_3 = (1.0f - t) * keys_2_3 + t*next_keys_2_3; +} +#endif + #endif CCL_NAMESPACE_END -- cgit v1.2.3