Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2016-10-12 14:46:25 +0300
committerSergey Sharybin <sergey.vfx@gmail.com>2016-10-12 15:11:55 +0300
commit42aeb608e75ec976c0bb3d91ca14b49371e43e6d (patch)
tree40af75276d105721ffafdc4d7c77d2e8d14aa7ed /intern/cycles/kernel/geom/geom_triangle_intersect.h
parent6a4ec3ca43b3aaade29a3642f3c6a6138b89e4b8 (diff)
Cycles: Implement AVX2 version of triangle_intersect
This commit basically vectorizes existing code using AVX2 instructions (without modifying algorithm itself). This gives quite nice speedups: BMW: -8% Classroom: -5% Cat: -5% Koro: +1% Barcelona: -8% That's on Linux machine, reported performance improvement on Windows goes up to 20%. Not currently sure why Koro is somewhat slower because it mainly uses curve intersection tests, could be a time noise? Or osmething with the cache utilization perhaps? In any case speedup in other scenes makes me thinking that current state is acceptable for initial implementation. This is again inspired by Maxym Dmytrychenko.
Diffstat (limited to 'intern/cycles/kernel/geom/geom_triangle_intersect.h')
-rw-r--r--intern/cycles/kernel/geom/geom_triangle_intersect.h62
1 files changed, 62 insertions, 0 deletions
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index dd5328220ab..b505bd54e5e 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -107,6 +107,67 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
/* Calculate vertices relative to ray origin. */
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
+
+#if defined(__KERNEL_AVX2__)
+ const avxf avxf_P(P.m128, P.m128);
+
+ const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
+ const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
+
+ const avxf AB = tri_ab - avxf_P;
+ const avxf BC = tri_bc - avxf_P;
+
+ const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
+
+ const avxf AB_k = shuffle(AB, permuteMask);
+ const avxf BC_k = shuffle(BC, permuteMask);
+
+ /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
+ const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
+
+ /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
+ const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
+
+ const avxf Sxy(Sy, Sx, Sy, Sx);
+
+ /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
+ const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
+
+ float ABBC_kz_array[8];
+ _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
+
+ const float A_kz = ABBC_kz_array[0];
+ const float B_kz = ABBC_kz_array[2];
+ const float C_kz = ABBC_kz_array[6];
+
+ /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
+ const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
+
+ const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
+
+ /* W U V
+ * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
+ */
+ const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
+
+ const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
+
+ /* Calculate scaled barycentric coordinates. */
+ float WUVW_array[4];
+ _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
+
+ const float W = WUVW_array[0];
+ const float U = WUVW_array[1];
+ const float V = WUVW_array[2];
+
+ const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
+ const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
+ _mm256_setzero_ps(), 0));
+
+ if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
+ return false;
+ }
+#else
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
@@ -135,6 +196,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
{
return false;
}
+#endif
/* Calculate determinant. */
float det = U + V + W;