From 844dacc267391cd2a725d81c2495636f0765771b Mon Sep 17 00:00:00 2001 From: Jianyu Huang Date: Thu, 7 Mar 2019 18:02:55 -0800 Subject: Fixes for FBGEMM FP16 performance (#82) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/82 This is a quick fix for matching FBGEMM FP16 performance with SKINNY GEMM FP16. Basically, this Diff switches the register layout in C accumulation buffer inside micro-kernel from MR * 1 to MR * 2. Check the reasons in T40816746. Reviewed By: zhengwy888 Differential Revision: D14278430 fbshipit-source-id: 961dd681deee69e2b7fec6bcdba7920e0b09134a --- src/FbgemmFP16.cc | 296 ++++--- src/FbgemmFP16UKernelsAvx2.cc | 1834 ++++------------------------------------- src/FbgemmFP16UKernelsAvx2.h | 20 +- src/codegen_fp16fp32.cc | 134 ++- 4 files changed, 374 insertions(+), 1910 deletions(-) (limited to 'src') diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc index d3d5c1f..868bc1b 100644 --- a/src/FbgemmFP16.cc +++ b/src/FbgemmFP16.cc @@ -35,23 +35,18 @@ inline void PackA(int nrow, int ncol, const float* from, int ldim, float* to) { struct KernelInfo { using knl_ptr = funcptr_fp16; // optimized kernels to cover all cases - static constexpr array kernel = { + // 2 in ?x2 should be the same as kernel_ncol_blocks. + // Here with kernel_ncol_blocks = 2, we can provide up to 6x2 kernels, due to + // the restrictions of ymm register numbers (16). + static constexpr array kernel = { { nullptr, - gemmkernel_1x1_AVX2_fA0fB0fC0, - gemmkernel_2x1_AVX2_fA0fB0fC0, - gemmkernel_3x1_AVX2_fA0fB0fC0, - gemmkernel_4x1_AVX2_fA0fB0fC0, - gemmkernel_5x1_AVX2_fA0fB0fC0, - gemmkernel_6x1_AVX2_fA0fB0fC0, - gemmkernel_7x1_AVX2_fA0fB0fC0, - gemmkernel_8x1_AVX2_fA0fB0fC0, - gemmkernel_9x1_AVX2_fA0fB0fC0, - gemmkernel_10x1_AVX2_fA0fB0fC0, - gemmkernel_11x1_AVX2_fA0fB0fC0, - gemmkernel_12x1_AVX2_fA0fB0fC0, - gemmkernel_13x1_AVX2_fA0fB0fC0, - gemmkernel_14x1_AVX2_fA0fB0fC0 + gemmkernel_1x2_AVX2_fA0fB0fC0, + gemmkernel_2x2_AVX2_fA0fB0fC0, + gemmkernel_3x2_AVX2_fA0fB0fC0, + gemmkernel_4x2_AVX2_fA0fB0fC0, + gemmkernel_5x2_AVX2_fA0fB0fC0, + gemmkernel_6x2_AVX2_fA0fB0fC0 } }; @@ -61,131 +56,131 @@ struct KernelInfo { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. { - {{ { 0, 0 }, { 0, 0 } } }, - {{ { 1, 1 }, { 0, 0 } } }, - {{ { 2, 1 }, { 0, 0 } } }, - {{ { 3, 1 }, { 0, 0 } } }, - {{ { 4, 1 }, { 0, 0 } } }, - {{ { 5, 1 }, { 0, 0 } } }, - {{ { 6, 1 }, { 0, 0 } } }, - {{ { 7, 1 }, { 0, 0 } } }, - {{ { 8, 1 }, { 0, 0 } } }, - {{ { 9, 1 }, { 0, 0 } } }, - {{ { 10, 1 }, { 0, 0 } } }, - {{ { 11, 1 }, { 0, 0 } } }, - {{ { 12, 1 }, { 0, 0 } } }, - {{ { 13, 1 }, { 0, 0 } } }, - {{ { 14, 1 }, { 0, 0 } } }, - {{ { 8, 1 }, { 7, 1 } } }, - {{ { 10, 1 }, { 6, 1 } } }, - {{ { 11, 1 }, { 6, 1 } } }, - {{ { 12, 1 }, { 6, 1 } } }, - {{ { 11, 1 }, { 8, 1 } } }, - {{ { 11, 1 }, { 9, 1 } } }, - {{ { 12, 1 }, { 9, 1 } } }, - {{ { 11, 2 }, { 0, 0 } } }, - {{ { 12, 1 }, { 11, 1 } } }, - {{ { 12, 2 }, { 0, 0 } } }, - {{ { 13, 1 }, { 12, 1 } } }, - {{ { 13, 2 }, { 0, 0 } } }, - {{ { 14, 1 }, { 13, 1 } } }, - {{ { 14, 2 }, { 0, 0 } } }, - {{ { 11, 2 }, { 7, 1 } } }, - {{ { 10, 3 }, { 0, 0 } } }, - {{ { 12, 2 }, { 7, 1 } } }, - {{ { 12, 2 }, { 8, 1 } } }, - {{ { 11, 3 }, { 0, 0 } } }, - {{ { 13, 2 }, { 8, 1 } } }, - {{ { 13, 2 }, { 9, 1 } } }, - {{ { 13, 2 }, { 10, 1 } } }, - {{ { 13, 2 }, { 11, 1 } } }, - {{ { 13, 2 }, { 12, 1 } } }, - {{ { 13, 3 }, { 0, 0 } } }, - {{ { 14, 2 }, { 12, 1 } } }, - {{ { 14, 2 }, { 13, 1 } } }, - {{ { 11, 3 }, { 9, 1 } } }, - {{ { 11, 3 }, { 10, 1 } } }, - {{ { 11, 4 }, { 0, 0 } } }, - {{ { 12, 3 }, { 9, 1 } } }, - {{ { 12, 3 }, { 10, 1 } } }, - {{ { 13, 3 }, { 8, 1 } } }, - {{ { 13, 3 }, { 9, 1 } } }, - {{ { 13, 3 }, { 10, 1 } } }, - {{ { 13, 3 }, { 11, 1 } } }, - {{ { 13, 3 }, { 12, 1 } } }, - {{ { 13, 4 }, { 0, 0 } } }, - {{ { 14, 3 }, { 11, 1 } } }, - {{ { 11, 4 }, { 10, 1 } } }, - {{ { 12, 4 }, { 7, 1 } } }, - {{ { 14, 4 }, { 0, 0 } } }, - {{ { 12, 4 }, { 9, 1 } } }, - {{ { 12, 4 }, { 10, 1 } } }, - {{ { 12, 4 }, { 11, 1 } } }, - {{ { 13, 4 }, { 8, 1 } } }, - {{ { 13, 4 }, { 9, 1 } } }, - {{ { 13, 4 }, { 10, 1 } } }, - {{ { 13, 4 }, { 11, 1 } } }, - {{ { 11, 5 }, { 9, 1 } } }, - {{ { 13, 5 }, { 0, 0 } } }, - {{ { 14, 4 }, { 10, 1 } } }, - {{ { 12, 5 }, { 7, 1 } } }, - {{ { 12, 5 }, { 8, 1 } } }, - {{ { 14, 4 }, { 13, 1 } } }, - {{ { 14, 5 }, { 0, 0 } } }, - {{ { 12, 5 }, { 11, 1 } } }, - {{ { 13, 5 }, { 7, 1 } } }, - {{ { 11, 6 }, { 7, 1 } } }, - {{ { 13, 5 }, { 9, 1 } } }, - {{ { 13, 5 }, { 10, 1 } } }, - {{ { 13, 5 }, { 11, 1 } } }, - {{ { 13, 5 }, { 12, 1 } } }, - {{ { 13, 6 }, { 0, 0 } } }, - {{ { 12, 6 }, { 7, 1 } } }, - {{ { 12, 6 }, { 8, 1 } } }, - {{ { 12, 6 }, { 9, 1 } } }, - {{ { 12, 6 }, { 10, 1 } } }, - {{ { 12, 6 }, { 11, 1 } } }, - {{ { 12, 7 }, { 0, 0 } } }, - {{ { 13, 6 }, { 7, 1 } } }, - {{ { 13, 6 }, { 8, 1 } } }, - {{ { 13, 6 }, { 9, 1 } } }, - {{ { 13, 6 }, { 10, 1 } } }, - {{ { 13, 6 }, { 11, 1 } } }, - {{ { 13, 6 }, { 12, 1 } } }, - {{ { 13, 7 }, { 0, 0 } } }, - {{ { 12, 7 }, { 8, 1 } } }, - {{ { 12, 7 }, { 9, 1 } } }, - {{ { 14, 6 }, { 10, 1 } } }, - {{ { 12, 7 }, { 11, 1 } } }, - {{ { 13, 7 }, { 5, 1 } } }, - {{ { 13, 7 }, { 6, 1 } } }, - {{ { 13, 7 }, { 7, 1 } } }, - {{ { 13, 7 }, { 8, 1 } } }, - {{ { 13, 7 }, { 9, 1 } } }, - {{ { 13, 7 }, { 10, 1 } } }, - {{ { 13, 7 }, { 11, 1 } } }, - {{ { 13, 7 }, { 12, 1 } } }, - {{ { 12, 8 }, { 8, 1 } } }, - {{ { 12, 8 }, { 9, 1 } } }, - {{ { 12, 8 }, { 10, 1 } } }, - {{ { 12, 8 }, { 11, 1 } } }, - {{ { 12, 9 }, { 0, 0 } } }, - {{ { 11, 9 }, { 10, 1 } } }, - {{ { 13, 8 }, { 6, 1 } } }, - {{ { 13, 8 }, { 7, 1 } } }, - {{ { 13, 8 }, { 8, 1 } } }, - {{ { 13, 8 }, { 9, 1 } } }, - {{ { 13, 8 }, { 10, 1 } } }, - {{ { 13, 8 }, { 11, 1 } } }, - {{ { 12, 9 }, { 8, 1 } } }, - {{ { 13, 9 }, { 0, 0 } } }, - {{ { 12, 9 }, { 10, 1 } } }, - {{ { 12, 9 }, { 11, 1 } } }, - {{ { 12, 10 }, { 0, 0 } } } + {{ { 0, 0 }, { 0, 0 } } }, // 0 + {{ { 1, 1 }, { 0, 0 } } }, // 1 + {{ { 2, 1 }, { 0, 0 } } }, // 2 + {{ { 3, 1 }, { 0, 0 } } }, // 3 + {{ { 4, 1 }, { 0, 0 } } }, // 4 + {{ { 5, 1 }, { 0, 0 } } }, // 5 + {{ { 6, 1 }, { 0, 0 } } }, // 6 + {{ { 5, 1 }, { 2, 1 } } }, // 7 + {{ { 4, 2 }, { 0, 0 } } }, // 8 + {{ { 5, 1 }, { 4, 1 } } }, // 9 + {{ { 5, 2 }, { 0, 0 } } }, // 10 + {{ { 6, 1 }, { 5, 1 } } }, // 11 + {{ { 6, 2 }, { 0, 0 } } }, // 12 + {{ { 5, 2 }, { 3, 1 } } }, // 13 + {{ { 6, 2 }, { 2, 1 } } }, // 14 + {{ { 5, 3 }, { 0, 0 } } }, // 15 + {{ { 6, 2 }, { 4, 1 } } }, // 16 + {{ { 6, 2 }, { 5, 1 } } }, // 17 + {{ { 6, 3 }, { 0, 0 } } }, // 18 + {{ { 5, 3 }, { 4, 1 } } }, // 19 + {{ { 5, 4 }, { 0, 0 } } }, // 20 + {{ { 5, 3 }, { 6, 1 } } }, // 21 + {{ { 6, 3 }, { 4, 1 } } }, // 22 + {{ { 6, 3 }, { 5, 1 } } }, // 23 + {{ { 6, 4 }, { 0, 0 } } }, // 24 + {{ { 5, 5 }, { 0, 0 } } }, // 25 + {{ { 5, 4 }, { 6, 1 } } }, // 26 + {{ { 6, 4 }, { 3, 1 } } }, // 27 + {{ { 6, 4 }, { 4, 1 } } }, // 28 + {{ { 6, 4 }, { 5, 1 } } }, // 29 + {{ { 6, 5 }, { 0, 0 } } }, // 30 + {{ { 6, 5 }, { 1, 1 } } }, // 31 + {{ { 6, 5 }, { 2, 1 } } }, // 32 + {{ { 6, 5 }, { 3, 1 } } }, // 33 + {{ { 6, 5 }, { 4, 1 } } }, // 34 + {{ { 6, 5 }, { 5, 1 } } }, // 35 + {{ { 6, 6 }, { 0, 0 } } }, // 36 + {{ { 6, 6 }, { 1, 1 } } }, // 37 + {{ { 6, 6 }, { 2, 1 } } }, // 38 + {{ { 6, 6 }, { 3, 1 } } }, // 39 + {{ { 6, 6 }, { 4, 1 } } }, // 40 + {{ { 6, 6 }, { 5, 1 } } }, // 41 + {{ { 6, 7 }, { 0, 0 } } }, // 42 + {{ { 6, 7 }, { 1, 1 } } }, // 43 + {{ { 6, 7 }, { 2, 1 } } }, // 44 + {{ { 6, 7 }, { 3, 1 } } }, // 45 + {{ { 6, 7 }, { 4, 1 } } }, // 46 + {{ { 6, 7 }, { 5, 1 } } }, // 47 + {{ { 6, 8 }, { 0, 0 } } }, // 48 + {{ { 6, 8 }, { 1, 1 } } }, // 49 + {{ { 6, 8 }, { 2, 1 } } }, // 50 + {{ { 6, 8 }, { 3, 1 } } }, // 51 + {{ { 6, 8 }, { 4, 1 } } }, // 52 + {{ { 6, 8 }, { 5, 1 } } }, // 53 + {{ { 6, 9 }, { 0, 0 } } }, // 54 + {{ { 6, 9 }, { 1, 1 } } }, // 55 + {{ { 6, 9 }, { 2, 1 } } }, // 56 + {{ { 6, 9 }, { 3, 1 } } }, // 57 + {{ { 6, 9 }, { 4, 1 } } }, // 58 + {{ { 6, 9 }, { 5, 1 } } }, // 59 + {{ { 6, 10 }, { 0, 0 } } }, // 60 + {{ { 6, 10 }, { 1, 1 } } }, // 61 + {{ { 6, 10 }, { 2, 1 } } }, // 62 + {{ { 6, 10 }, { 3, 1 } } }, // 63 + {{ { 6, 10 }, { 4, 1 } } }, // 64 + {{ { 6, 10 }, { 5, 1 } } }, // 65 + {{ { 6, 11 }, { 0, 0 } } }, // 66 + {{ { 6, 11 }, { 1, 1 } } }, // 67 + {{ { 6, 11 }, { 2, 1 } } }, // 68 + {{ { 6, 11 }, { 3, 1 } } }, // 69 + {{ { 6, 11 }, { 4, 1 } } }, // 70 + {{ { 6, 11 }, { 5, 1 } } }, // 71 + {{ { 6, 12 }, { 0, 0 } } }, // 72 + {{ { 6, 12 }, { 1, 1 } } }, // 73 + {{ { 6, 12 }, { 2, 1 } } }, // 74 + {{ { 6, 12 }, { 3, 1 } } }, // 75 + {{ { 6, 12 }, { 4, 1 } } }, // 76 + {{ { 6, 12 }, { 5, 1 } } }, // 77 + {{ { 6, 13 }, { 0, 0 } } }, // 78 + {{ { 6, 13 }, { 1, 1 } } }, // 79 + {{ { 6, 13 }, { 2, 1 } } }, // 80 + {{ { 6, 13 }, { 3, 1 } } }, // 81 + {{ { 6, 13 }, { 4, 1 } } }, // 82 + {{ { 6, 13 }, { 5, 1 } } }, // 83 + {{ { 6, 14 }, { 0, 0 } } }, // 84 + {{ { 6, 14 }, { 1, 1 } } }, // 85 + {{ { 6, 14 }, { 2, 1 } } }, // 86 + {{ { 6, 14 }, { 3, 1 } } }, // 87 + {{ { 6, 14 }, { 4, 1 } } }, // 88 + {{ { 6, 14 }, { 5, 1 } } }, // 89 + {{ { 6, 15 }, { 0, 0 } } }, // 90 + {{ { 6, 15 }, { 1, 1 } } }, // 91 + {{ { 6, 15 }, { 2, 1 } } }, // 92 + {{ { 6, 15 }, { 3, 1 } } }, // 93 + {{ { 6, 15 }, { 4, 1 } } }, // 94 + {{ { 6, 15 }, { 5, 1 } } }, // 95 + {{ { 6, 16 }, { 0, 0 } } }, // 96 + {{ { 6, 16 }, { 1, 1 } } }, // 97 + {{ { 6, 16 }, { 2, 1 } } }, // 98 + {{ { 6, 16 }, { 3, 1 } } }, // 99 + {{ { 6, 16 }, { 4, 1 } } }, // 100 + {{ { 6, 16 }, { 5, 1 } } }, // 101 + {{ { 6, 17 }, { 0, 0 } } }, // 102 + {{ { 6, 17 }, { 1, 1 } } }, // 103 + {{ { 6, 17 }, { 2, 1 } } }, // 104 + {{ { 6, 17 }, { 3, 1 } } }, // 105 + {{ { 6, 17 }, { 4, 1 } } }, // 106 + {{ { 6, 17 }, { 5, 1 } } }, // 107 + {{ { 6, 18 }, { 0, 0 } } }, // 108 + {{ { 6, 18 }, { 1, 1 } } }, // 109 + {{ { 6, 18 }, { 2, 1 } } }, // 110 + {{ { 6, 18 }, { 3, 1 } } }, // 111 + {{ { 6, 18 }, { 4, 1 } } }, // 112 + {{ { 6, 18 }, { 5, 1 } } }, // 113 + {{ { 6, 19 }, { 0, 0 } } }, // 114 + {{ { 6, 19 }, { 1, 1 } } }, // 115 + {{ { 6, 19 }, { 2, 1 } } }, // 116 + {{ { 6, 19 }, { 3, 1 } } }, // 117 + {{ { 6, 19 }, { 4, 1 } } }, // 118 + {{ { 6, 19 }, { 5, 1 } } }, // 119 + {{ { 6, 20 }, { 0, 0 } } }, // 120 } }; }; -constexpr array KernelInfo::kernel; +constexpr array KernelInfo::kernel; constexpr array, 2>, 121> KernelInfo::partition; // autotuned kernel splits for various cases m = 1:mb_max @@ -208,8 +203,8 @@ FBGEMM_API void cblas_gemm_compute( const int n = Bp.numCols(), k = Bp.numRows(), ldc = n; const int mb_max = 120; constexpr int simd_width = 8; - constexpr int kernel_ncol_blocks = 1; - constexpr int kernel_ncols = kernel_ncol_blocks * simd_width; + int kernel_ncol_blocks = Bp.kernelNumColBlocks(); + int kernel_ncols = kernel_ncol_blocks * simd_width; // private scratchpad storage static thread_local unique_ptr> scratchpad( @@ -267,7 +262,7 @@ FBGEMM_API void cblas_gemm_compute( fbgemmGetRange( num_threads, thread_id, gp.b_block_cols, 1, jb_begin, jb_end); gp.B += gp.k * Bp.blockColSize() * jb_begin; - gp.C += 8 * jb_begin; + gp.C += Bp.blockColSize() * jb_begin; gp.b_block_cols = jb_end - jb_begin; if (gp.b_block_cols) { KernelInfo::kernel[kernel_nrows](&gp); @@ -279,7 +274,7 @@ FBGEMM_API void cblas_gemm_compute( fbgemmGetRange( num_threads, thread_id, gp.b_block_cols, 1, jb_begin, jb_end); gp.B += gp.k * Bp.blockColSize() * jb_begin; - gp.C += 8 * jb_begin; + gp.C += Bp.blockColSize() * jb_begin; gp.b_block_cols = jb_end - jb_begin; if (gp.b_block_cols) { KernelInfo::kernel[kernel_nrows](&gp); @@ -291,35 +286,36 @@ FBGEMM_API void cblas_gemm_compute( // leftover int rem = n - last_blk_col; assert(rem < kernel_ncols); - int b = (rem % simd_width) ? ((rem + simd_width) / simd_width) - : (rem / simd_width); - assert(b == 1); - if ((rem % simd_width) == 0) { + + if ((rem % Bp.blockColSize()) == 0) { gp.B = &(Bp(k_ind, last_blk_col)); gp.C = &C[m2 * ldc + last_blk_col]; gp.b_block_cols = 1; KernelInfo::kernel[kernel_nrows](&gp); } else { - // small temporary buffer + // small temporary buffer: the size should be larger than the + // required kernel_nrow x kernel_ncols elements computed in the + // registers. float c_tmp[16 * 24] = {0}; assert((16 * 24) > kernel_nrows * kernel_ncols); gp.B = &(Bp(k_ind, last_blk_col)); gp.C = c_tmp; - gp.ldc = 8 * sizeof(C[0]); + gp.ldc = kernel_ncols * sizeof(C[0]); gp.b_block_cols = 1; KernelInfo::kernel[kernel_nrows](&gp); for (int i = 0; i < kernel_nrows; i++) { // Todo: use assembly for (int j = last_blk_col; j < n; j++) { assert( - i * 8 + (j - last_blk_col) < + i * kernel_ncols + (j - last_blk_col) < sizeof(c_tmp) / sizeof(c_tmp[0])); if (accum == 0) { - C[(m2 + i) * ldc + j] = c_tmp[i * 8 + (j - last_blk_col)]; + C[(m2 + i) * ldc + j] = + c_tmp[i * kernel_ncols + (j - last_blk_col)]; } else { C[(m2 + i) * ldc + j] = beta_ * C[(m2 + i) * ldc + j] + - c_tmp[i * 8 + (j - last_blk_col)]; + c_tmp[i * kernel_ncols + (j - last_blk_col)]; } } } diff --git a/src/FbgemmFP16UKernelsAvx2.cc b/src/FbgemmFP16UKernelsAvx2.cc index 8a0cb0d..0c795b0 100644 --- a/src/FbgemmFP16UKernelsAvx2.cc +++ b/src/FbgemmFP16UKernelsAvx2.cc @@ -8,1095 +8,7 @@ namespace fbgemm { -void __attribute__((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm1,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm1\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm1,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm0,ymm14,ymm1\t\n" - "add r11, 32\t\n" - "add r9,8\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm2,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm2\t\n" - "vbroadcastss ymm2,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm2\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm2,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm0,ymm14,ymm2\t\n" - "vbroadcastss ymm2,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm1,ymm14,ymm2\t\n" - "add r11, 32\t\n" - "add r9,16\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm3\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm3\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm3\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm0,ymm14,ymm3\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm1,ymm14,ymm3\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm3,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm2,ymm14,ymm3\t\n" - "add r9,24\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm4\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm4\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm4\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm4\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm0,ymm14,ymm4\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm1,ymm14,ymm4\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm2,ymm14,ymm4\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm4,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm3,ymm14,ymm4\t\n" - "add r9,32\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm5\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm0,ymm14,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm1,ymm14,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm2,ymm14,ymm5\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm3,ymm14,ymm5\t\n" - "vbroadcastss ymm5,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm4,ymm14,ymm5\t\n" - "add r9,40\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - "vxorps ymm5,ymm5,ymm5\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm6\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm0,ymm14,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm1,ymm14,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm2,ymm14,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm3,ymm14,ymm6\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm4,ymm14,ymm6\t\n" - "vbroadcastss ymm6,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm5,ymm14,ymm6\t\n" - "add r9,48\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - "vxorps ymm5,ymm5,ymm5\t\n" - "vxorps ymm6,ymm6,ymm6\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm7\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm0,ymm14,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm1,ymm14,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm2,ymm14,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm3,ymm14,ymm7\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm4,ymm14,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm5,ymm14,ymm7\t\n" - "vbroadcastss ymm7,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm6,ymm14,ymm7\t\n" - "add r9,56\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp) { - asm volatile( -#if !defined(__clang__) - "mov r14, %[gp]\t\n" -#else - "mov %[gp], %%r14\t\n" - ".intel_syntax noprefix\t\n" -#endif - - // Copy parameters - // k - "mov r8, [r14 + 0]\t\n" - // A - "mov r9, [r14 + 8]\t\n" - // B - "mov r10, [r14 + 16]\t\n" - // beta - "mov r15, [r14 + 24]\t\n" - // accum - "mov rdx, [r14 + 32]\t\n" - // C - "mov r12, [r14 + 40]\t\n" - // ldc - "mov r13, [r14 + 48]\t\n" - // b_block_cols - "mov rdi, [r14 + 56]\t\n" - // b_block_size - "mov rsi, [r14 + 64]\t\n" - // Make copies of A and C - "mov rax, r9\t\n" - "mov rcx, r12\t\n" - - "mov rbx, 0\t\n" - "loop_outter%=:\t\n" - "mov r14, 0\t\n" - "vxorps ymm0,ymm0,ymm0\t\n" - "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - "vxorps ymm5,ymm5,ymm5\t\n" - "vxorps ymm6,ymm6,ymm6\t\n" - "vxorps ymm7,ymm7,ymm7\t\n" - - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" - - "loop_inner%=:\t\n" - - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm8\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" - "inc r14\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm0,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm1,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm2,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm3,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm4,ymm14,ymm8\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm5,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm6,ymm14,ymm8\t\n" - "vbroadcastss ymm8,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm7,ymm14,ymm8\t\n" - "add r9,64\t\n" - "cmp r14, r8\t\n" - "jl loop_inner%=\t\n" - - "L_exit%=:\t\n" - "add r10, rsi\t\n" - - "cmp rdx, 1\t\n" - "je L_accum%=\t\n" - // Dump C - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "jmp L_done%=\t\n" - - "L_accum%=:\t\n" - // Dump C with accumulate - "vbroadcastss ymm15,DWORD PTR [r15]\t\n" - "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - - "L_done%=:\t\n" - - // next outer iteration - "add rcx, 32\t\n" - "mov r12, rcx\t\n" - "mov r9, rax\t\n" - "inc rbx\t\n" - "cmp rbx, rdi\t\n" - "jl loop_outter%=\t\n" - : - : [gp] "rm"(gp) - : "r8", - "r9", - "r10", - "r11", - "r15", - "r13", - "r14", - "rax", - "rcx", - "rdx", - "rsi", - "rdi", - "rbx", - "r12", - "memory"); -} -void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_1x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -1133,89 +45,28 @@ void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) { "mov r14, 0\t\n" "vxorps ymm0,ymm0,ymm0\t\n" "vxorps ymm1,ymm1,ymm1\t\n" - "vxorps ymm2,ymm2,ymm2\t\n" - "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - "vxorps ymm5,ymm5,ymm5\t\n" - "vxorps ymm6,ymm6,ymm6\t\n" - "vxorps ymm7,ymm7,ymm7\t\n" - "vxorps ymm8,ymm8,ymm8\t\n" - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm9\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" + "vcvtph2ps ymm3,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm4,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm2,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm3,ymm2\t\n" + "vfmadd231ps ymm1,ymm4,ymm2\t\n" + "add r9,4\t\n" + "add r10,32\t\n" "inc r14\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm0,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm1,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm2,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm3,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm4,ymm14,ymm9\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm5,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm6,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+64]\t\n" - "vfmadd231ps ymm7,ymm14,ymm9\t\n" - "vbroadcastss ymm9,DWORD PTR [r9+68]\t\n" - "vfmadd231ps ymm8,ymm14,ymm9\t\n" - "add r9,72\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" "L_exit%=:\t\n" - "add r10, rsi\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -1224,36 +75,14 @@ void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" @@ -1277,7 +106,7 @@ void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) { "r12", "memory"); } -void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_2x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -1316,94 +145,34 @@ void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vxorps ymm1,ymm1,ymm1\t\n" "vxorps ymm2,ymm2,ymm2\t\n" "vxorps ymm3,ymm3,ymm3\t\n" - "vxorps ymm4,ymm4,ymm4\t\n" - "vxorps ymm5,ymm5,ymm5\t\n" - "vxorps ymm6,ymm6,ymm6\t\n" - "vxorps ymm7,ymm7,ymm7\t\n" - "vxorps ymm8,ymm8,ymm8\t\n" - "vxorps ymm9,ymm9,ymm9\t\n" - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm9,ymm15,ymm10\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" + "vcvtph2ps ymm5,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm6,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm4,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm5,ymm4\t\n" + "vfmadd231ps ymm1,ymm6,ymm4\t\n" + "vbroadcastss ymm4,DWORD PTR [r9+4]\t\n" + "vfmadd231ps ymm2,ymm5,ymm4\t\n" + "vfmadd231ps ymm3,ymm6,ymm4\t\n" + "add r9,8\t\n" + "add r10,32\t\n" "inc r14\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm0,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm1,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm2,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm3,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm4,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm5,ymm14,ymm10\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+64]\t\n" - "vfmadd231ps ymm6,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+68]\t\n" - "vfmadd231ps ymm7,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+72]\t\n" - "vfmadd231ps ymm8,ymm14,ymm10\t\n" - "vbroadcastss ymm10,DWORD PTR [r9+76]\t\n" - "vfmadd231ps ymm9,ymm14,ymm10\t\n" - "add r9,80\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" "L_exit%=:\t\n" - "add r10, rsi\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -1412,39 +181,19 @@ void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" + "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" @@ -1468,7 +217,7 @@ void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) { "r12", "memory"); } -void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_3x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -1509,99 +258,40 @@ void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vxorps ymm3,ymm3,ymm3\t\n" "vxorps ymm4,ymm4,ymm4\t\n" "vxorps ymm5,ymm5,ymm5\t\n" - "vxorps ymm6,ymm6,ymm6\t\n" - "vxorps ymm7,ymm7,ymm7\t\n" - "vxorps ymm8,ymm8,ymm8\t\n" - "vxorps ymm9,ymm9,ymm9\t\n" - "vxorps ymm10,ymm10,ymm10\t\n" - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm9,ymm15,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm10,ymm15,ymm11\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" + "vcvtph2ps ymm7,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm8,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm6,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm7,ymm6\t\n" + "vfmadd231ps ymm1,ymm8,ymm6\t\n" + "vbroadcastss ymm6,DWORD PTR [r9+4]\t\n" + "vfmadd231ps ymm2,ymm7,ymm6\t\n" + "vfmadd231ps ymm3,ymm8,ymm6\t\n" + "vbroadcastss ymm6,DWORD PTR [r9+8]\t\n" + "vfmadd231ps ymm4,ymm7,ymm6\t\n" + "vfmadd231ps ymm5,ymm8,ymm6\t\n" + "add r9,12\t\n" + "add r10,32\t\n" "inc r14\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm0,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm1,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm2,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm3,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm4,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+64]\t\n" - "vfmadd231ps ymm5,ymm14,ymm11\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+68]\t\n" - "vfmadd231ps ymm6,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+72]\t\n" - "vfmadd231ps ymm7,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+76]\t\n" - "vfmadd231ps ymm8,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+80]\t\n" - "vfmadd231ps ymm9,ymm14,ymm11\t\n" - "vbroadcastss ymm11,DWORD PTR [r9+84]\t\n" - "vfmadd231ps ymm10,ymm14,ymm11\t\n" - "add r9,88\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" "L_exit%=:\t\n" - "add r10, rsi\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -1610,42 +300,24 @@ void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" + "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" @@ -1669,7 +341,7 @@ void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) { "r12", "memory"); } -void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_4x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -1712,104 +384,46 @@ void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vxorps ymm5,ymm5,ymm5\t\n" "vxorps ymm6,ymm6,ymm6\t\n" "vxorps ymm7,ymm7,ymm7\t\n" - "vxorps ymm8,ymm8,ymm8\t\n" - "vxorps ymm9,ymm9,ymm9\t\n" - "vxorps ymm10,ymm10,ymm10\t\n" - "vxorps ymm11,ymm11,ymm11\t\n" - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm9,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm10,ymm15,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm11,ymm15,ymm12\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" + "vcvtph2ps ymm9,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm10,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm8,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm9,ymm8\t\n" + "vfmadd231ps ymm1,ymm10,ymm8\t\n" + "vbroadcastss ymm8,DWORD PTR [r9+4]\t\n" + "vfmadd231ps ymm2,ymm9,ymm8\t\n" + "vfmadd231ps ymm3,ymm10,ymm8\t\n" + "vbroadcastss ymm8,DWORD PTR [r9+8]\t\n" + "vfmadd231ps ymm4,ymm9,ymm8\t\n" + "vfmadd231ps ymm5,ymm10,ymm8\t\n" + "vbroadcastss ymm8,DWORD PTR [r9+12]\t\n" + "vfmadd231ps ymm6,ymm9,ymm8\t\n" + "vfmadd231ps ymm7,ymm10,ymm8\t\n" + "add r9,16\t\n" + "add r10,32\t\n" "inc r14\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm0,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm1,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm2,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm3,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+64]\t\n" - "vfmadd231ps ymm4,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+68]\t\n" - "vfmadd231ps ymm5,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+72]\t\n" - "vfmadd231ps ymm6,ymm14,ymm12\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+76]\t\n" - "vfmadd231ps ymm7,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+80]\t\n" - "vfmadd231ps ymm8,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+84]\t\n" - "vfmadd231ps ymm9,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+88]\t\n" - "vfmadd231ps ymm10,ymm14,ymm12\t\n" - "vbroadcastss ymm12,DWORD PTR [r9+92]\t\n" - "vfmadd231ps ymm11,ymm14,ymm12\t\n" - "add r9,96\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" "L_exit%=:\t\n" - "add r10, rsi\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -1818,45 +432,29 @@ void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" + "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" @@ -1880,7 +478,7 @@ void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) { "r12", "memory"); } -void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_5x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -1925,109 +523,52 @@ void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vxorps ymm7,ymm7,ymm7\t\n" "vxorps ymm8,ymm8,ymm8\t\n" "vxorps ymm9,ymm9,ymm9\t\n" - "vxorps ymm10,ymm10,ymm10\t\n" - "vxorps ymm11,ymm11,ymm11\t\n" - "vxorps ymm12,ymm12,ymm12\t\n" - "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n" - "mov r11, 16\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n" - "inc r14\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm9,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm10,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm11,ymm15,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm12,ymm15,ymm13\t\n" - "cmp r14, r8\t\n" - "jge L_exit%=\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n" + "vcvtph2ps ymm11,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm12,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm10,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm11,ymm10\t\n" + "vfmadd231ps ymm1,ymm12,ymm10\t\n" + "vbroadcastss ymm10,DWORD PTR [r9+4]\t\n" + "vfmadd231ps ymm2,ymm11,ymm10\t\n" + "vfmadd231ps ymm3,ymm12,ymm10\t\n" + "vbroadcastss ymm10,DWORD PTR [r9+8]\t\n" + "vfmadd231ps ymm4,ymm11,ymm10\t\n" + "vfmadd231ps ymm5,ymm12,ymm10\t\n" + "vbroadcastss ymm10,DWORD PTR [r9+12]\t\n" + "vfmadd231ps ymm6,ymm11,ymm10\t\n" + "vfmadd231ps ymm7,ymm12,ymm10\t\n" + "vbroadcastss ymm10,DWORD PTR [r9+16]\t\n" + "vfmadd231ps ymm8,ymm11,ymm10\t\n" + "vfmadd231ps ymm9,ymm12,ymm10\t\n" + "add r9,20\t\n" + "add r10,32\t\n" "inc r14\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm0,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+56]\t\n" - "vfmadd231ps ymm1,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+60]\t\n" - "vfmadd231ps ymm2,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+64]\t\n" - "vfmadd231ps ymm3,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+68]\t\n" - "vfmadd231ps ymm4,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+72]\t\n" - "vfmadd231ps ymm5,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+76]\t\n" - "vfmadd231ps ymm6,ymm14,ymm13\t\n" - "add r11, 32\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+80]\t\n" - "vfmadd231ps ymm7,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+84]\t\n" - "vfmadd231ps ymm8,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+88]\t\n" - "vfmadd231ps ymm9,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+92]\t\n" - "vfmadd231ps ymm10,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+96]\t\n" - "vfmadd231ps ymm11,ymm14,ymm13\t\n" - "vbroadcastss ymm13,DWORD PTR [r9+100]\t\n" - "vfmadd231ps ymm12,ymm14,ymm13\t\n" - "add r9,104\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" "L_exit%=:\t\n" - "add r10, rsi\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -2036,48 +577,34 @@ void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" + "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm12,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n" + "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" @@ -2101,7 +628,7 @@ void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) { "r12", "memory"); } -void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp) { +void __attribute__((noinline)) gemmkernel_6x2_AVX2_fA0fB0fC0(GemmParams* gp) { asm volatile( #if !defined(__clang__) "mov r14, %[gp]\t\n" @@ -2148,79 +675,58 @@ void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vxorps ymm9,ymm9,ymm9\t\n" "vxorps ymm10,ymm10,ymm10\t\n" "vxorps ymm11,ymm11,ymm11\t\n" - "vxorps ymm12,ymm12,ymm12\t\n" - "vxorps ymm13,ymm13,ymm13\t\n" - "mov r11, 0\t\n" "loop_inner%=:\t\n" - "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11]\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n" - "vfmadd231ps ymm0,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n" - "vfmadd231ps ymm1,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n" - "vfmadd231ps ymm2,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n" - "vfmadd231ps ymm3,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n" - "vfmadd231ps ymm4,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n" - "vfmadd231ps ymm5,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n" - "vfmadd231ps ymm6,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+28]\t\n" - "vfmadd231ps ymm7,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+32]\t\n" - "vfmadd231ps ymm8,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+36]\t\n" - "vfmadd231ps ymm9,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+40]\t\n" - "vfmadd231ps ymm10,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+44]\t\n" - "vfmadd231ps ymm11,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+48]\t\n" - "vfmadd231ps ymm12,ymm15,ymm14\t\n" - "vbroadcastss ymm14,DWORD PTR [r9+52]\t\n" - "vfmadd231ps ymm13,ymm15,ymm14\t\n" - "add r9,56\t\n" - "add r11, 16\t\n" + "vcvtph2ps ymm13,XMMWORD PTR [r10 + 0]\t\n" + "vcvtph2ps ymm14,XMMWORD PTR [r10 + 16]\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+0]\t\n" + "vfmadd231ps ymm0,ymm13,ymm12\t\n" + "vfmadd231ps ymm1,ymm14,ymm12\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+4]\t\n" + "vfmadd231ps ymm2,ymm13,ymm12\t\n" + "vfmadd231ps ymm3,ymm14,ymm12\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+8]\t\n" + "vfmadd231ps ymm4,ymm13,ymm12\t\n" + "vfmadd231ps ymm5,ymm14,ymm12\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+12]\t\n" + "vfmadd231ps ymm6,ymm13,ymm12\t\n" + "vfmadd231ps ymm7,ymm14,ymm12\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+16]\t\n" + "vfmadd231ps ymm8,ymm13,ymm12\t\n" + "vfmadd231ps ymm9,ymm14,ymm12\t\n" + "vbroadcastss ymm12,DWORD PTR [r9+20]\t\n" + "vfmadd231ps ymm10,ymm13,ymm12\t\n" + "vfmadd231ps ymm11,ymm14,ymm12\t\n" + "add r9,24\t\n" + "add r10,32\t\n" "inc r14\t\n" "cmp r14, r8\t\n" "jl loop_inner%=\t\n" - "add r10, rsi\t\n" + + "L_exit%=:\t\n" "cmp rdx, 1\t\n" "je L_accum%=\t\n" // Dump C "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n" "add r12, r13\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n" - "add r12, r13\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm13\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm11\t\n" "add r12, r13\t\n" "jmp L_done%=\t\n" @@ -2229,51 +735,39 @@ void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp) { "vbroadcastss ymm15,DWORD PTR [r15]\t\n" "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n" + "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n" "add r12, r13\t\n" "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n" + "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n" "add r12, r13\t\n" "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n" + "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n" "add r12, r13\t\n" "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n" + "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n" "add r12, r13\t\n" "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n" + "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n" "add r12, r13\t\n" "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n" "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm12,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n" - "add r12, r13\t\n" - "vfmadd231ps ymm13,ymm15,YMMWORD PTR [r12 + 0]\t\n" - "vmovups YMMWORD PTR [r12 + 0], ymm13\t\n" + "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 32]\t\n" + "vmovups YMMWORD PTR [r12 + 32], ymm11\t\n" "add r12, r13\t\n" "L_done%=:\t\n" // next outer iteration - "add rcx, 32\t\n" + "add rcx, 64\t\n" "mov r12, rcx\t\n" "mov r9, rax\t\n" "inc rbx\t\n" diff --git a/src/FbgemmFP16UKernelsAvx2.h b/src/FbgemmFP16UKernelsAvx2.h index 4053332..6e7dfbc 100644 --- a/src/FbgemmFP16UKernelsAvx2.h +++ b/src/FbgemmFP16UKernelsAvx2.h @@ -24,20 +24,12 @@ struct GemmParams { uint64_t b_block_cols; uint64_t b_block_size; }; -void __attribute__((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp); -void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_1x2_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_2x2_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_3x2_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_4x2_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_5x2_AVX2_fA0fB0fC0(GemmParams* gp); +void __attribute__((noinline)) gemmkernel_6x2_AVX2_fA0fB0fC0(GemmParams* gp); typedef void (*funcptr_fp16)(GemmParams* gp); ; diff --git a/src/codegen_fp16fp32.cc b/src/codegen_fp16fp32.cc index 17bb113..7c8e10c 100644 --- a/src/codegen_fp16fp32.cc +++ b/src/codegen_fp16fp32.cc @@ -47,20 +47,35 @@ int main() { {2, "AVX2", { - {1, 1, 0}, - {2, 1, 0}, - {3, 1, 0}, - {4, 1, 0}, - {5, 1, 0}, - {6, 1, 0}, - {7, 1, 0}, - {8, 1, 0}, - {9, 1, 0}, - {10, 1, 0}, - {11, 1, 0}, - {12, 1, 0}, - {13, 1, 0}, - {14, 1, 0}, + // 4x3 register layout + // {1, 3, 0}, + // {2, 3, 0}, + // {3, 3, 0}, + // {4, 3, 0}, + + // 6x2 register layout + {1, 2, 0}, + {2, 2, 0}, + {3, 2, 0}, + {4, 2, 0}, + {5, 2, 0}, + {6, 2, 0}, + + // 14x1 register layout + // {1, 1, 0}, + // {2, 1, 0}, + // {3, 1, 0}, + // {4, 1, 0}, + // {5, 1, 0}, + // {6, 1, 0}, + // {7, 1, 0}, + // {8, 1, 0}, + // {9, 1, 0}, + // {10, 1, 0}, + // {11, 1, 0}, + // {12, 1, 0}, + // {13, 1, 0}, + // {14, 1, 0}, }}}; // open all files @@ -159,7 +174,6 @@ int main() { string vAtmp = "ymm" + to_string(last_free_ymmreg++); // produce register block of B col - assert(ukernel_shape[k][1] == 1); vector vBcol(ukernel_shape[k][1]); for (auto c = 0; c < ukernel_shape[k][1]; c++) { @@ -228,82 +242,50 @@ int main() { srcfile << "\n"; - if (ukernel_shape[k][0] <= 13) { - addi(srcfile, "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]"); - addi(srcfile, "mov r11, 16"); - } else { - addi(srcfile, "mov r11, 0"); - } - srcfile << "\n"; string label = "loop_inner%="; addi(srcfile, label + ":"); srcfile << "\n"; - if (ukernel_shape[k][0] <= 13) { - auto a_offset = 0, unroll_factor = 2; - for (auto u = 0; u < unroll_factor; u++) { - string breg = (u == 0) ? "ymm14" : "ymm15"; - string breg_rev = (u == 0) ? "ymm15" : "ymm14"; - - addi( - srcfile, - "vcvtph2ps " + breg + ",XMMWORD PTR [r10 + r11 + " + - to_string(u * 16) + "]"); - addi(srcfile, "inc r14"); - for (auto r = 0; r < vCtile.size(); r++) { - addi( - srcfile, - "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" + - to_string(a_offset) + "]"); - addi( - srcfile, - "vfmadd231ps " + vCtile[r][0] + "," + breg_rev + "," + - vAtmp); - if (u == 1 && r == vCtile.size() / 2) - addi(srcfile, "add r11, 32"); - a_offset += 4; - } - if (u < unroll_factor - 1) { - addi(srcfile, "cmp r14, r8"); - addi(srcfile, "jge " + exitlabel); - } - } - - addi(srcfile, "add r9," + to_string(a_offset)); - addi(srcfile, "cmp r14, r8"); - addi(srcfile, "jl " + label); - - srcfile << "\n"; + for (int c = 0; c < vCtile[0].size(); c++) { + addi( + srcfile, + "vcvtph2ps " + vBcol[c] + ",XMMWORD PTR [r10 + " + + to_string(16 * c) + "]"); + } - addi(srcfile, exitlabel + ":"); - } else { + for (int r = 0; r < vCtile.size(); r++) { addi( srcfile, - "vcvtph2ps " + vBcol[0] + ",XMMWORD PTR [r10 + r11]"); - for (auto r = 0; r < vCtile.size(); r++) { + "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" + + to_string(4 * r) + "]"); + for (int c = 0; c < vCtile[0].size(); c++) { addi( srcfile, - "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" + - to_string(4 * r) + "]"); - addi( - srcfile, - "vfmadd231ps " + vCtile[r][0] + "," + vBcol[0] + "," + + "vfmadd231ps " + vCtile[r][c] + "," + vBcol[c] + "," + vAtmp); } + } - addi( - srcfile, - "add r9," + to_string(4 * ukernel_shape[k][0]), - fixedA); // move A ptr - addi(srcfile, "add r11, 16"); + addi( + srcfile, + "add r9," + to_string(4 * ukernel_shape[k][0]), + fixedA); // move A ptr - addi(srcfile, "inc r14"); - addi(srcfile, "cmp r14, r8"); - addi(srcfile, "jl " + label); - } + addi( + srcfile, + "add r10," + to_string(16 * ukernel_shape[k][1]), + fixedA); // move A ptr + + addi(srcfile, "inc r14"); + addi(srcfile, "cmp r14, r8"); + addi(srcfile, "jl " + label); + + srcfile << "\n"; + + addi(srcfile, exitlabel + ":"); - addi(srcfile, "add r10, rsi"); + // addi(srcfile, "add r10, rsi"); srcfile << "\n"; // end marker -- cgit v1.2.3