Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Jianyu Huang <jianyuhuang@fb.com> 2019-03-08 05:02:55 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-08 05:05:39 +0300
commit: 844dacc267391cd2a725d81c2495636f0765771b (patch)
tree: cb20d367ce086dfb4374a7794fa8d889ffead09b /src
parent: 66b41357561f2ff9895d2b4638273f07c49dbe29 (diff)
Fixes for FBGEMM FP16 performance (#82)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/82 This is a quick fix for matching FBGEMM FP16 performance with SKINNY GEMM FP16. Basically, this Diff switches the register layout in C accumulation buffer inside micro-kernel from MR * 1 to MR * 2. Check the reasons in T40816746. Reviewed By: zhengwy888 Differential Revision: D14278430 fbshipit-source-id: 961dd681deee69e2b7fec6bcdba7920e0b09134a
Diffstat (limited to 'src')
-rw-r--r-- src/FbgemmFP16.cc | 296
-rw-r--r-- src/FbgemmFP16UKernelsAvx2.cc | 1786
-rw-r--r-- src/FbgemmFP16UKernelsAvx2.h | 20
-rw-r--r-- src/codegen_fp16fp32.cc | 134
4 files changed, 350 insertions, 1886 deletions
diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc
index d3d5c1f..868bc1b 100644
--- a/src/FbgemmFP16.cc
+++ b/src/FbgemmFP16.cc
@@ -35,23 +35,18 @@ inline void PackA(int nrow, int ncol, const float* from, int ldim, float* to) {
struct KernelInfo {
using knl_ptr = funcptr_fp16;
// optimized kernels to cover all cases
- static constexpr array<knl_ptr, 15> kernel = {
+ // 2 in ?x2 should be the same as kernel_ncol_blocks.
+ // Here with kernel_ncol_blocks = 2, we can provide up to 6x2 kernels, due to
+ // the restrictions of ymm register numbers (16).
+ static constexpr array<knl_ptr, 7> kernel = {
{
nullptr,
- gemmkernel_1x1_AVX2_fA0fB0fC0,
- gemmkernel_2x1_AVX2_fA0fB0fC0,
- gemmkernel_3x1_AVX2_fA0fB0fC0,
- gemmkernel_4x1_AVX2_fA0fB0fC0,
- gemmkernel_5x1_AVX2_fA0fB0fC0,
- gemmkernel_6x1_AVX2_fA0fB0fC0,
- gemmkernel_7x1_AVX2_fA0fB0fC0,
- gemmkernel_8x1_AVX2_fA0fB0fC0,
- gemmkernel_9x1_AVX2_fA0fB0fC0,
- gemmkernel_10x1_AVX2_fA0fB0fC0,
- gemmkernel_11x1_AVX2_fA0fB0fC0,
- gemmkernel_12x1_AVX2_fA0fB0fC0,
- gemmkernel_13x1_AVX2_fA0fB0fC0,
- gemmkernel_14x1_AVX2_fA0fB0fC0
+ gemmkernel_1x2_AVX2_fA0fB0fC0,
+ gemmkernel_2x2_AVX2_fA0fB0fC0,
+ gemmkernel_3x2_AVX2_fA0fB0fC0,
+ gemmkernel_4x2_AVX2_fA0fB0fC0,
+ gemmkernel_5x2_AVX2_fA0fB0fC0,
+ gemmkernel_6x2_AVX2_fA0fB0fC0
}
};
@@ -61,131 +56,131 @@ struct KernelInfo {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
{
- {{ { 0, 0 }, { 0, 0 } } },
- {{ { 1, 1 }, { 0, 0 } } },
- {{ { 2, 1 }, { 0, 0 } } },
- {{ { 3, 1 }, { 0, 0 } } },
- {{ { 4, 1 }, { 0, 0 } } },
- {{ { 5, 1 }, { 0, 0 } } },
- {{ { 6, 1 }, { 0, 0 } } },
- {{ { 7, 1 }, { 0, 0 } } },
- {{ { 8, 1 }, { 0, 0 } } },
- {{ { 9, 1 }, { 0, 0 } } },
- {{ { 10, 1 }, { 0, 0 } } },
- {{ { 11, 1 }, { 0, 0 } } },
- {{ { 12, 1 }, { 0, 0 } } },
- {{ { 13, 1 }, { 0, 0 } } },
- {{ { 14, 1 }, { 0, 0 } } },
- {{ { 8, 1 }, { 7, 1 } } },
- {{ { 10, 1 }, { 6, 1 } } },
- {{ { 11, 1 }, { 6, 1 } } },
- {{ { 12, 1 }, { 6, 1 } } },
- {{ { 11, 1 }, { 8, 1 } } },
- {{ { 11, 1 }, { 9, 1 } } },
- {{ { 12, 1 }, { 9, 1 } } },
- {{ { 11, 2 }, { 0, 0 } } },
- {{ { 12, 1 }, { 11, 1 } } },
- {{ { 12, 2 }, { 0, 0 } } },
- {{ { 13, 1 }, { 12, 1 } } },
- {{ { 13, 2 }, { 0, 0 } } },
- {{ { 14, 1 }, { 13, 1 } } },
- {{ { 14, 2 }, { 0, 0 } } },
- {{ { 11, 2 }, { 7, 1 } } },
- {{ { 10, 3 }, { 0, 0 } } },
- {{ { 12, 2 }, { 7, 1 } } },
- {{ { 12, 2 }, { 8, 1 } } },
- {{ { 11, 3 }, { 0, 0 } } },
- {{ { 13, 2 }, { 8, 1 } } },
- {{ { 13, 2 }, { 9, 1 } } },
- {{ { 13, 2 }, { 10, 1 } } },
- {{ { 13, 2 }, { 11, 1 } } },
- {{ { 13, 2 }, { 12, 1 } } },
- {{ { 13, 3 }, { 0, 0 } } },
- {{ { 14, 2 }, { 12, 1 } } },
- {{ { 14, 2 }, { 13, 1 } } },
- {{ { 11, 3 }, { 9, 1 } } },
- {{ { 11, 3 }, { 10, 1 } } },
- {{ { 11, 4 }, { 0, 0 } } },
- {{ { 12, 3 }, { 9, 1 } } },
- {{ { 12, 3 }, { 10, 1 } } },
- {{ { 13, 3 }, { 8, 1 } } },
- {{ { 13, 3 }, { 9, 1 } } },
- {{ { 13, 3 }, { 10, 1 } } },
- {{ { 13, 3 }, { 11, 1 } } },
- {{ { 13, 3 }, { 12, 1 } } },
- {{ { 13, 4 }, { 0, 0 } } },
- {{ { 14, 3 }, { 11, 1 } } },
- {{ { 11, 4 }, { 10, 1 } } },
- {{ { 12, 4 }, { 7, 1 } } },
- {{ { 14, 4 }, { 0, 0 } } },
- {{ { 12, 4 }, { 9, 1 } } },
- {{ { 12, 4 }, { 10, 1 } } },
- {{ { 12, 4 }, { 11, 1 } } },
- {{ { 13, 4 }, { 8, 1 } } },
- {{ { 13, 4 }, { 9, 1 } } },
- {{ { 13, 4 }, { 10, 1 } } },
- {{ { 13, 4 }, { 11, 1 } } },
- {{ { 11, 5 }, { 9, 1 } } },
- {{ { 13, 5 }, { 0, 0 } } },
- {{ { 14, 4 }, { 10, 1 } } },
- {{ { 12, 5 }, { 7, 1 } } },
- {{ { 12, 5 }, { 8, 1 } } },
- {{ { 14, 4 }, { 13, 1 } } },
- {{ { 14, 5 }, { 0, 0 } } },
- {{ { 12, 5 }, { 11, 1 } } },
- {{ { 13, 5 }, { 7, 1 } } },
- {{ { 11, 6 }, { 7, 1 } } },
- {{ { 13, 5 }, { 9, 1 } } },
- {{ { 13, 5 }, { 10, 1 } } },
- {{ { 13, 5 }, { 11, 1 } } },
- {{ { 13, 5 }, { 12, 1 } } },
- {{ { 13, 6 }, { 0, 0 } } },
- {{ { 12, 6 }, { 7, 1 } } },
- {{ { 12, 6 }, { 8, 1 } } },
- {{ { 12, 6 }, { 9, 1 } } },
- {{ { 12, 6 }, { 10, 1 } } },
- {{ { 12, 6 }, { 11, 1 } } },
- {{ { 12, 7 }, { 0, 0 } } },
- {{ { 13, 6 }, { 7, 1 } } },
- {{ { 13, 6 }, { 8, 1 } } },
- {{ { 13, 6 }, { 9, 1 } } },
- {{ { 13, 6 }, { 10, 1 } } },
- {{ { 13, 6 }, { 11, 1 } } },
- {{ { 13, 6 }, { 12, 1 } } },
- {{ { 13, 7 }, { 0, 0 } } },
- {{ { 12, 7 }, { 8, 1 } } },
- {{ { 12, 7 }, { 9, 1 } } },
- {{ { 14, 6 }, { 10, 1 } } },
- {{ { 12, 7 }, { 11, 1 } } },
- {{ { 13, 7 }, { 5, 1 } } },
- {{ { 13, 7 }, { 6, 1 } } },
- {{ { 13, 7 }, { 7, 1 } } },
- {{ { 13, 7 }, { 8, 1 } } },
- {{ { 13, 7 }, { 9, 1 } } },
- {{ { 13, 7 }, { 10, 1 } } },
- {{ { 13, 7 }, { 11, 1 } } },
- {{ { 13, 7 }, { 12, 1 } } },
- {{ { 12, 8 }, { 8, 1 } } },
- {{ { 12, 8 }, { 9, 1 } } },
- {{ { 12, 8 }, { 10, 1 } } },
- {{ { 12, 8 }, { 11, 1 } } },
- {{ { 12, 9 }, { 0, 0 } } },
- {{ { 11, 9 }, { 10, 1 } } },
- {{ { 13, 8 }, { 6, 1 } } },
- {{ { 13, 8 }, { 7, 1 } } },
- {{ { 13, 8 }, { 8, 1 } } },
- {{ { 13, 8 }, { 9, 1 } } },
- {{ { 13, 8 }, { 10, 1 } } },
- {{ { 13, 8 }, { 11, 1 } } },
- {{ { 12, 9 }, { 8, 1 } } },
- {{ { 13, 9 }, { 0, 0 } } },
- {{ { 12, 9 }, { 10, 1 } } },
- {{ { 12, 9 }, { 11, 1 } } },
- {{ { 12, 10 }, { 0, 0 } } }
+ {{ { 0, 0 }, { 0, 0 } } }, // 0
+ {{ { 1, 1 }, { 0, 0 } } }, // 1
+ {{ { 2, 1 }, { 0, 0 } } }, // 2
+ {{ { 3, 1 }, { 0, 0 } } }, // 3
+ {{ { 4, 1 }, { 0, 0 } } }, // 4
+ {{ { 5, 1 }, { 0, 0 } } }, // 5
+ {{ { 6, 1 }, { 0, 0 } } }, // 6
+ {{ { 5, 1 }, { 2, 1 } } }, // 7
+ {{ { 4, 2 }, { 0, 0 } } }, // 8
+ {{ { 5, 1 }, { 4, 1 } } }, // 9
+ {{ { 5, 2 }, { 0, 0 } } }, // 10
+ {{ { 6, 1 }, { 5, 1 } } }, // 11
+ {{ { 6, 2 }, { 0, 0 } } }, // 12
+ {{ { 5, 2 }, { 3, 1 } } }, // 13
+ {{ { 6, 2 }, { 2, 1 } } }, // 14
+ {{ { 5, 3 }, { 0, 0 } } }, // 15
+ {{ { 6, 2 }, { 4, 1 } } }, // 16
+ {{ { 6, 2 }, { 5, 1 } } }, // 17
+ {{ { 6, 3 }, { 0, 0 } } }, // 18
+ {{ { 5, 3 }, { 4, 1 } } }, // 19
+ {{ { 5, 4 }, { 0, 0 } } }, // 20
+ {{ { 5, 3 }, { 6, 1 } } }, // 21
+ {{ { 6, 3 }, { 4, 1 } } }, // 22
+ {{ { 6, 3 }, { 5, 1 } } }, // 23
+ {{ { 6, 4 }, { 0, 0 } } }, // 24
+ {{ { 5, 5 }, { 0, 0 } } }, // 25
+ {{ { 5, 4 }, { 6, 1 } } }, // 26
+ {{ { 6, 4 }, { 3, 1 } } }, // 27
+ {{ { 6, 4 }, { 4, 1 } } }, // 28
+ {{ { 6, 4 }, { 5, 1 } } }, // 29
+ {{ { 6, 5 }, { 0, 0 } } }, // 30
+ {{ { 6, 5 }, { 1, 1 } } }, // 31
+ {{ { 6, 5 }, { 2, 1 } } }, // 32
+ {{ { 6, 5 }, { 3, 1 } } }, // 33
+ {{ { 6, 5 }, { 4, 1 } } }, // 34
+ {{ { 6, 5 }, { 5, 1 } } }, // 35
+ {{ { 6, 6 }, { 0, 0 } } }, // 36
+ {{ { 6, 6 }, { 1, 1 } } }, // 37
+ {{ { 6, 6 }, { 2, 1 } } }, // 38
+ {{ { 6, 6 }, { 3, 1 } } }, // 39
+ {{ { 6, 6 }, { 4, 1 } } }, // 40
+ {{ { 6, 6 }, { 5, 1 } } }, // 41
+ {{ { 6, 7 }, { 0, 0 } } }, // 42
+ {{ { 6, 7 }, { 1, 1 } } }, // 43
+ {{ { 6, 7 }, { 2, 1 } } }, // 44
+ {{ { 6, 7 }, { 3, 1 } } }, // 45
+ {{ { 6, 7 }, { 4, 1 } } }, // 46
+ {{ { 6, 7 }, { 5, 1 } } }, // 47
+ {{ { 6, 8 }, { 0, 0 } } }, // 48
+ {{ { 6, 8 }, { 1, 1 } } }, // 49
+ {{ { 6, 8 }, { 2, 1 } } }, // 50
+ {{ { 6, 8 }, { 3, 1 } } }, // 51
+ {{ { 6, 8 }, { 4, 1 } } }, // 52
+ {{ { 6, 8 }, { 5, 1 } } }, // 53
+ {{ { 6, 9 }, { 0, 0 } } }, // 54
+ {{ { 6, 9 }, { 1, 1 } } }, // 55
+ {{ { 6, 9 }, { 2, 1 } } }, // 56
+ {{ { 6, 9 }, { 3, 1 } } }, // 57
+ {{ { 6, 9 }, { 4, 1 } } }, // 58
+ {{ { 6, 9 }, { 5, 1 } } }, // 59
+ {{ { 6, 10 }, { 0, 0 } } }, // 60
+ {{ { 6, 10 }, { 1, 1 } } }, // 61
+ {{ { 6, 10 }, { 2, 1 } } }, // 62
+ {{ { 6, 10 }, { 3, 1 } } }, // 63
+ {{ { 6, 10 }, { 4, 1 } } }, // 64
+ {{ { 6, 10 }, { 5, 1 } } }, // 65
+ {{ { 6, 11 }, { 0, 0 } } }, // 66
+ {{ { 6, 11 }, { 1, 1 } } }, // 67
+ {{ { 6, 11 }, { 2, 1 } } }, // 68
+ {{ { 6, 11 }, { 3, 1 } } }, // 69
+ {{ { 6, 11 }, { 4, 1 } } }, // 70
+ {{ { 6, 11 }, { 5, 1 } } }, // 71
+ {{ { 6, 12 }, { 0, 0 } } }, // 72
+ {{ { 6, 12 }, { 1, 1 } } }, // 73
+ {{ { 6, 12 }, { 2, 1 } } }, // 74
+ {{ { 6, 12 }, { 3, 1 } } }, // 75
+ {{ { 6, 12 }, { 4, 1 } } }, // 76
+ {{ { 6, 12 }, { 5, 1 } } }, // 77
+ {{ { 6, 13 }, { 0, 0 } } }, // 78
+ {{ { 6, 13 }, { 1, 1 } } }, // 79
+ {{ { 6, 13 }, { 2, 1 } } }, // 80
+ {{ { 6, 13 }, { 3, 1 } } }, // 81
+ {{ { 6, 13 }, { 4, 1 } } }, // 82
+ {{ { 6, 13 }, { 5, 1 } } }, // 83
+ {{ { 6, 14 }, { 0, 0 } } }, // 84
+ {{ { 6, 14 }, { 1, 1 } } }, // 85
+ {{ { 6, 14 }, { 2, 1 } } }, // 86
+ {{ { 6, 14 }, { 3, 1 } } }, // 87
+ {{ { 6, 14 }, { 4, 1 } } }, // 88
+ {{ { 6, 14 }, { 5, 1 } } }, // 89
+ {{ { 6, 15 }, { 0, 0 } } }, // 90
+ {{ { 6, 15 }, { 1, 1 } } }, // 91
+ {{ { 6, 15 }, { 2, 1 } } }, // 92
+ {{ { 6, 15 }, { 3, 1 } } }, // 93
+ {{ { 6, 15 }, { 4, 1 } } }, // 94
+ {{ { 6, 15 }, { 5, 1 } } }, // 95
+ {{ { 6, 16 }, { 0, 0 } } }, // 96
+ {{ { 6, 16 }, { 1, 1 } } }, // 97
+ {{ { 6, 16 }, { 2, 1 } } }, // 98
+ {{ { 6, 16 }, { 3, 1 } } }, // 99
+ {{ { 6, 16 }, { 4, 1 } } }, // 100
+ {{ { 6, 16 }, { 5, 1 } } }, // 101
+ {{ { 6, 17 }, { 0, 0 } } }, // 102
+ {{ { 6, 17 }, { 1, 1 } } }, // 103
+ {{ { 6, 17 }, { 2, 1 } } }, // 104
+ {{ { 6, 17 }, { 3, 1 } } }, // 105
+ {{ { 6, 17 }, { 4, 1 } } }, // 106
+ {{ { 6, 17 }, { 5, 1 } } }, // 107
+ {{ { 6, 18 }, { 0, 0 } } }, // 108
+ {{ { 6, 18 }, { 1, 1 } } }, // 109
+ {{ { 6, 18 }, { 2, 1 } } }, // 110
+ {{ { 6, 18 }, { 3, 1 } } }, // 111
+ {{ { 6, 18 }, { 4, 1 } } }, // 112
+ {{ { 6, 18 }, { 5, 1 } } }, // 113
+ {{ { 6, 19 }, { 0, 0 } } }, // 114
+ {{ { 6, 19 }, { 1, 1 } } }, // 115
+ {{ { 6, 19 }, { 2, 1 } } }, // 116
+ {{ { 6, 19 }, { 3, 1 } } }, // 117
+ {{ { 6, 19 }, { 4, 1 } } }, // 118
+ {{ { 6, 19 }, { 5, 1 } } }, // 119
+ {{ { 6, 20 }, { 0, 0 } } }, // 120
}
};
};
-constexpr array<KernelInfo::knl_ptr, 15> KernelInfo::kernel;
+constexpr array<KernelInfo::knl_ptr, 7> KernelInfo::kernel;
constexpr array<array<array<int, 2>, 2>, 121> KernelInfo::partition;
// autotuned kernel splits for various cases m = 1:mb_max
@@ -208,8 +203,8 @@ FBGEMM_API void cblas_gemm_compute(
const int n = Bp.numCols(), k = Bp.numRows(), ldc = n;
const int mb_max = 120;
constexpr int simd_width = 8;
- constexpr int kernel_ncol_blocks = 1;
- constexpr int kernel_ncols = kernel_ncol_blocks * simd_width;
+ int kernel_ncol_blocks = Bp.kernelNumColBlocks();
+ int kernel_ncols = kernel_ncol_blocks * simd_width;
// private scratchpad storage
static thread_local unique_ptr<std::array<float, 256 * 1024>> scratchpad(
@@ -267,7 +262,7 @@ FBGEMM_API void cblas_gemm_compute(
fbgemmGetRange(
num_threads, thread_id, gp.b_block_cols, 1, jb_begin, jb_end);
gp.B += gp.k * Bp.blockColSize() * jb_begin;
- gp.C += 8 * jb_begin;
+ gp.C += Bp.blockColSize() * jb_begin;
gp.b_block_cols = jb_end - jb_begin;
if (gp.b_block_cols) {
KernelInfo::kernel[kernel_nrows](&gp);
@@ -279,7 +274,7 @@ FBGEMM_API void cblas_gemm_compute(
fbgemmGetRange(
num_threads, thread_id, gp.b_block_cols, 1, jb_begin, jb_end);
gp.B += gp.k * Bp.blockColSize() * jb_begin;
- gp.C += 8 * jb_begin;
+ gp.C += Bp.blockColSize() * jb_begin;
gp.b_block_cols = jb_end - jb_begin;
if (gp.b_block_cols) {
KernelInfo::kernel[kernel_nrows](&gp);
@@ -291,35 +286,36 @@ FBGEMM_API void cblas_gemm_compute(
// leftover
int rem = n - last_blk_col;
assert(rem < kernel_ncols);
- int b = (rem % simd_width) ? ((rem + simd_width) / simd_width)
- : (rem / simd_width);
- assert(b == 1);
- if ((rem % simd_width) == 0) {
+
+ if ((rem % Bp.blockColSize()) == 0) {
gp.B = &(Bp(k_ind, last_blk_col));
gp.C = &C[m2 * ldc + last_blk_col];
gp.b_block_cols = 1;
KernelInfo::kernel[kernel_nrows](&gp);
} else {
- // small temporary buffer
+ // small temporary buffer: the size should be larger than the
+ // required kernel_nrow x kernel_ncols elements computed in the
+ // registers.
float c_tmp[16 * 24] = {0};
assert((16 * 24) > kernel_nrows * kernel_ncols);
gp.B = &(Bp(k_ind, last_blk_col));
gp.C = c_tmp;
- gp.ldc = 8 * sizeof(C[0]);
+ gp.ldc = kernel_ncols * sizeof(C[0]);
gp.b_block_cols = 1;
KernelInfo::kernel[kernel_nrows](&gp);
for (int i = 0; i < kernel_nrows; i++) {
// Todo: use assembly
for (int j = last_blk_col; j < n; j++) {
assert(
- i * 8 + (j - last_blk_col) <
+ i * kernel_ncols + (j - last_blk_col) <
sizeof(c_tmp) / sizeof(c_tmp[0]));
if (accum == 0) {
- C[(m2 + i) * ldc + j] = c_tmp[i * 8 + (j - last_blk_col)];
+ C[(m2 + i) * ldc + j] =
+ c_tmp[i * kernel_ncols + (j - last_blk_col)];
} else {
C[(m2 + i) * ldc + j] = beta_ * C[(m2 + i) * ldc + j] +
- c_tmp[i * 8 + (j - last_blk_col)];
+ c_tmp[i * kernel_ncols + (j - last_blk_col)];
}
}
}
diff --git a/src/FbgemmFP16UKernelsAvx2.cc b/src/FbgemmFP16UKernelsAvx2.cc
index 8a0cb0d..0c795b0 100644
--- a/src/FbgemmFP16UKernelsAvx2.cc
+++ b/src/FbgemmFP16UKernelsAvx2.cc
@@ -8,108 +8,7 @@
namespace fbgemm {
-void __attribute__((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm1,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm1\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm1,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm1\t\n"
- "add r11, 32\t\n"
- "add r9,8\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_1x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -147,157 +46,27 @@ void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm0,ymm0,ymm0\t\n"
"vxorps ymm1,ymm1,ymm1\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm3,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm4,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm2,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm2\t\n"
- "vbroadcastss ymm2,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm2\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm2,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm2\t\n"
- "vbroadcastss ymm2,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm2\t\n"
- "add r11, 32\t\n"
- "add r9,16\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
+ "vfmadd231ps ymm0,ymm3,ymm2\t\n"
+ "vfmadd231ps ymm1,ymm4,ymm2\t\n"
+ "add r9,4\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm3\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm3\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm3\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm3\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm3\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm3,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm3\t\n"
- "add r9,24\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -306,18 +75,14 @@ void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
@@ -341,7 +106,7 @@ void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"r12",
"memory");
}
-void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_2x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -381,189 +146,33 @@ void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm2,ymm2,ymm2\t\n"
"vxorps ymm3,ymm3,ymm3\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm5,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm6,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm4,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm4\t\n"
+ "vfmadd231ps ymm0,ymm5,ymm4\t\n"
+ "vfmadd231ps ymm1,ymm6,ymm4\t\n"
"vbroadcastss ymm4,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm4\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm4\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm4\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm4\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm4\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm4\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm4,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm4\t\n"
- "add r9,32\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm5\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
+ "vfmadd231ps ymm2,ymm5,ymm4\t\n"
+ "vfmadd231ps ymm3,ymm6,ymm4\t\n"
+ "add r9,8\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm5\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm5\t\n"
- "vbroadcastss ymm5,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm5\t\n"
- "add r9,40\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -572,24 +181,19 @@ void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
+ "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
@@ -613,7 +217,7 @@ void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"r12",
"memory");
}
-void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_3x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -655,221 +259,39 @@ void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm4,ymm4,ymm4\t\n"
"vxorps ymm5,ymm5,ymm5\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm7,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm8,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm6,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm6\t\n"
+ "vfmadd231ps ymm0,ymm7,ymm6\t\n"
+ "vfmadd231ps ymm1,ymm8,ymm6\t\n"
"vbroadcastss ymm6,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm6\t\n"
+ "vfmadd231ps ymm2,ymm7,ymm6\t\n"
+ "vfmadd231ps ymm3,ymm8,ymm6\t\n"
"vbroadcastss ymm6,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm6\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm6\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm6\t\n"
- "vbroadcastss ymm6,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm6\t\n"
- "add r9,48\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
- "vxorps ymm5,ymm5,ymm5\t\n"
- "vxorps ymm6,ymm6,ymm6\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
+ "vfmadd231ps ymm4,ymm7,ymm6\t\n"
+ "vfmadd231ps ymm5,ymm8,ymm6\t\n"
+ "add r9,12\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm7\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm7\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm7\t\n"
- "vbroadcastss ymm7,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm7\t\n"
- "add r9,56\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -878,30 +300,24 @@ void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
+ "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
@@ -925,7 +341,7 @@ void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"r12",
"memory");
}
-void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_4x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -969,253 +385,45 @@ void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm6,ymm6,ymm6\t\n"
"vxorps ymm7,ymm7,ymm7\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm9,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm10,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm8,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm8\t\n"
+ "vfmadd231ps ymm0,ymm9,ymm8\t\n"
+ "vfmadd231ps ymm1,ymm10,ymm8\t\n"
"vbroadcastss ymm8,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm8\t\n"
+ "vfmadd231ps ymm2,ymm9,ymm8\t\n"
+ "vfmadd231ps ymm3,ymm10,ymm8\t\n"
"vbroadcastss ymm8,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm8\t\n"
+ "vfmadd231ps ymm4,ymm9,ymm8\t\n"
+ "vfmadd231ps ymm5,ymm10,ymm8\t\n"
"vbroadcastss ymm8,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm8\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm8\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm8\t\n"
- "vbroadcastss ymm8,DWORD PTR [r9+60]\t\n"
- "vfmadd231ps ymm7,ymm14,ymm8\t\n"
- "add r9,64\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
- "vxorps ymm5,ymm5,ymm5\t\n"
- "vxorps ymm6,ymm6,ymm6\t\n"
- "vxorps ymm7,ymm7,ymm7\t\n"
- "vxorps ymm8,ymm8,ymm8\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm9\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
+ "vfmadd231ps ymm6,ymm9,ymm8\t\n"
+ "vfmadd231ps ymm7,ymm10,ymm8\t\n"
+ "add r9,16\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm9\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+60]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+64]\t\n"
- "vfmadd231ps ymm7,ymm14,ymm9\t\n"
- "vbroadcastss ymm9,DWORD PTR [r9+68]\t\n"
- "vfmadd231ps ymm8,ymm14,ymm9\t\n"
- "add r9,72\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -1224,36 +432,29 @@ void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
+ "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
@@ -1277,7 +478,7 @@ void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"r12",
"memory");
}
-void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_5x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -1323,285 +524,51 @@ void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm8,ymm8,ymm8\t\n"
"vxorps ymm9,ymm9,ymm9\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm11,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm12,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm10,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm10\t\n"
+ "vfmadd231ps ymm0,ymm11,ymm10\t\n"
+ "vfmadd231ps ymm1,ymm12,ymm10\t\n"
"vbroadcastss ymm10,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm10\t\n"
+ "vfmadd231ps ymm2,ymm11,ymm10\t\n"
+ "vfmadd231ps ymm3,ymm12,ymm10\t\n"
"vbroadcastss ymm10,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm10\t\n"
+ "vfmadd231ps ymm4,ymm11,ymm10\t\n"
+ "vfmadd231ps ymm5,ymm12,ymm10\t\n"
"vbroadcastss ymm10,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm10\t\n"
+ "vfmadd231ps ymm6,ymm11,ymm10\t\n"
+ "vfmadd231ps ymm7,ymm12,ymm10\t\n"
"vbroadcastss ymm10,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm9,ymm15,ymm10\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+60]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm10\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+64]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+68]\t\n"
- "vfmadd231ps ymm7,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+72]\t\n"
- "vfmadd231ps ymm8,ymm14,ymm10\t\n"
- "vbroadcastss ymm10,DWORD PTR [r9+76]\t\n"
- "vfmadd231ps ymm9,ymm14,ymm10\t\n"
- "add r9,80\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
- "vxorps ymm5,ymm5,ymm5\t\n"
- "vxorps ymm6,ymm6,ymm6\t\n"
- "vxorps ymm7,ymm7,ymm7\t\n"
- "vxorps ymm8,ymm8,ymm8\t\n"
- "vxorps ymm9,ymm9,ymm9\t\n"
- "vxorps ymm10,ymm10,ymm10\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
+ "vfmadd231ps ymm8,ymm11,ymm10\t\n"
+ "vfmadd231ps ymm9,ymm12,ymm10\t\n"
+ "add r9,20\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm9,ymm15,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm10,ymm15,ymm11\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+60]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+64]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm11\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+68]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+72]\t\n"
- "vfmadd231ps ymm7,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+76]\t\n"
- "vfmadd231ps ymm8,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+80]\t\n"
- "vfmadd231ps ymm9,ymm14,ymm11\t\n"
- "vbroadcastss ymm11,DWORD PTR [r9+84]\t\n"
- "vfmadd231ps ymm10,ymm14,ymm11\t\n"
- "add r9,88\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -1610,42 +577,34 @@ void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
+ "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
+ "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
@@ -1669,7 +628,7 @@ void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"r12",
"memory");
}
-void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) {
+void __attribute__((noinline)) gemmkernel_6x2_AVX2_fA0fB0fC0(GemmParams* gp) {
asm volatile(
#if !defined(__clang__)
"mov r14, %[gp]\t\n"
@@ -1717,317 +676,57 @@ void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vxorps ymm10,ymm10,ymm10\t\n"
"vxorps ymm11,ymm11,ymm11\t\n"
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
"loop_inner%=:\t\n"
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
+ "vcvtph2ps ymm13,XMMWORD PTR [r10 + 0]\t\n"
+ "vcvtph2ps ymm14,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm12,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm9,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm10,ymm15,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm11,ymm15,ymm12\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+52]\t\n"
+ "vfmadd231ps ymm0,ymm13,ymm12\t\n"
"vfmadd231ps ymm1,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+60]\t\n"
+ "vbroadcastss ymm12,DWORD PTR [r9+4]\t\n"
+ "vfmadd231ps ymm2,ymm13,ymm12\t\n"
"vfmadd231ps ymm3,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+64]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+68]\t\n"
+ "vbroadcastss ymm12,DWORD PTR [r9+8]\t\n"
+ "vfmadd231ps ymm4,ymm13,ymm12\t\n"
"vfmadd231ps ymm5,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+72]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm12\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+76]\t\n"
+ "vbroadcastss ymm12,DWORD PTR [r9+12]\t\n"
+ "vfmadd231ps ymm6,ymm13,ymm12\t\n"
"vfmadd231ps ymm7,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+80]\t\n"
- "vfmadd231ps ymm8,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+84]\t\n"
+ "vbroadcastss ymm12,DWORD PTR [r9+16]\t\n"
+ "vfmadd231ps ymm8,ymm13,ymm12\t\n"
"vfmadd231ps ymm9,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+88]\t\n"
- "vfmadd231ps ymm10,ymm14,ymm12\t\n"
- "vbroadcastss ymm12,DWORD PTR [r9+92]\t\n"
+ "vbroadcastss ymm12,DWORD PTR [r9+20]\t\n"
+ "vfmadd231ps ymm10,ymm13,ymm12\t\n"
"vfmadd231ps ymm11,ymm14,ymm12\t\n"
- "add r9,96\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
-
- "L_exit%=:\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
- "vxorps ymm5,ymm5,ymm5\t\n"
- "vxorps ymm6,ymm6,ymm6\t\n"
- "vxorps ymm7,ymm7,ymm7\t\n"
- "vxorps ymm8,ymm8,ymm8\t\n"
- "vxorps ymm9,ymm9,ymm9\t\n"
- "vxorps ymm10,ymm10,ymm10\t\n"
- "vxorps ymm11,ymm11,ymm11\t\n"
- "vxorps ymm12,ymm12,ymm12\t\n"
-
- "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]\t\n"
- "mov r11, 16\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm14,XMMWORD PTR [r10 + r11 + 0]\t\n"
- "inc r14\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm9,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm10,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm11,ymm15,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm12,ymm15,ymm13\t\n"
- "cmp r14, r8\t\n"
- "jge L_exit%=\t\n"
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11 + 16]\t\n"
+ "add r9,24\t\n"
+ "add r10,32\t\n"
"inc r14\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm0,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+56]\t\n"
- "vfmadd231ps ymm1,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+60]\t\n"
- "vfmadd231ps ymm2,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+64]\t\n"
- "vfmadd231ps ymm3,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+68]\t\n"
- "vfmadd231ps ymm4,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+72]\t\n"
- "vfmadd231ps ymm5,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+76]\t\n"
- "vfmadd231ps ymm6,ymm14,ymm13\t\n"
- "add r11, 32\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+80]\t\n"
- "vfmadd231ps ymm7,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+84]\t\n"
- "vfmadd231ps ymm8,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+88]\t\n"
- "vfmadd231ps ymm9,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+92]\t\n"
- "vfmadd231ps ymm10,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+96]\t\n"
- "vfmadd231ps ymm11,ymm14,ymm13\t\n"
- "vbroadcastss ymm13,DWORD PTR [r9+100]\t\n"
- "vfmadd231ps ymm12,ymm14,ymm13\t\n"
- "add r9,104\t\n"
"cmp r14, r8\t\n"
"jl loop_inner%=\t\n"
"L_exit%=:\t\n"
- "add r10, rsi\t\n"
"cmp rdx, 1\t\n"
"je L_accum%=\t\n"
// Dump C
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n"
"add r12, r13\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm11\t\n"
"add r12, r13\t\n"
"jmp L_done%=\t\n"
@@ -2036,244 +735,39 @@ void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp) {
"vbroadcastss ymm15,DWORD PTR [r15]\t\n"
"vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
+ "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
+ "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
+ "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
+ "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
+ "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm9\t\n"
"add r12, r13\t\n"
"vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n"
"vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm12,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n"
- "add r12, r13\t\n"
-
- "L_done%=:\t\n"
-
- // next outer iteration
- "add rcx, 32\t\n"
- "mov r12, rcx\t\n"
- "mov r9, rax\t\n"
- "inc rbx\t\n"
- "cmp rbx, rdi\t\n"
- "jl loop_outter%=\t\n"
- :
- : [gp] "rm"(gp)
- : "r8",
- "r9",
- "r10",
- "r11",
- "r15",
- "r13",
- "r14",
- "rax",
- "rcx",
- "rdx",
- "rsi",
- "rdi",
- "rbx",
- "r12",
- "memory");
-}
-void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp) {
- asm volatile(
-#if !defined(__clang__)
- "mov r14, %[gp]\t\n"
-#else
- "mov %[gp], %%r14\t\n"
- ".intel_syntax noprefix\t\n"
-#endif
-
- // Copy parameters
- // k
- "mov r8, [r14 + 0]\t\n"
- // A
- "mov r9, [r14 + 8]\t\n"
- // B
- "mov r10, [r14 + 16]\t\n"
- // beta
- "mov r15, [r14 + 24]\t\n"
- // accum
- "mov rdx, [r14 + 32]\t\n"
- // C
- "mov r12, [r14 + 40]\t\n"
- // ldc
- "mov r13, [r14 + 48]\t\n"
- // b_block_cols
- "mov rdi, [r14 + 56]\t\n"
- // b_block_size
- "mov rsi, [r14 + 64]\t\n"
- // Make copies of A and C
- "mov rax, r9\t\n"
- "mov rcx, r12\t\n"
-
- "mov rbx, 0\t\n"
- "loop_outter%=:\t\n"
- "mov r14, 0\t\n"
- "vxorps ymm0,ymm0,ymm0\t\n"
- "vxorps ymm1,ymm1,ymm1\t\n"
- "vxorps ymm2,ymm2,ymm2\t\n"
- "vxorps ymm3,ymm3,ymm3\t\n"
- "vxorps ymm4,ymm4,ymm4\t\n"
- "vxorps ymm5,ymm5,ymm5\t\n"
- "vxorps ymm6,ymm6,ymm6\t\n"
- "vxorps ymm7,ymm7,ymm7\t\n"
- "vxorps ymm8,ymm8,ymm8\t\n"
- "vxorps ymm9,ymm9,ymm9\t\n"
- "vxorps ymm10,ymm10,ymm10\t\n"
- "vxorps ymm11,ymm11,ymm11\t\n"
- "vxorps ymm12,ymm12,ymm12\t\n"
- "vxorps ymm13,ymm13,ymm13\t\n"
-
- "mov r11, 0\t\n"
-
- "loop_inner%=:\t\n"
-
- "vcvtph2ps ymm15,XMMWORD PTR [r10 + r11]\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n"
- "vfmadd231ps ymm0,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n"
- "vfmadd231ps ymm1,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n"
- "vfmadd231ps ymm2,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n"
- "vfmadd231ps ymm3,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n"
- "vfmadd231ps ymm4,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n"
- "vfmadd231ps ymm5,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n"
- "vfmadd231ps ymm6,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+28]\t\n"
- "vfmadd231ps ymm7,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+32]\t\n"
- "vfmadd231ps ymm8,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+36]\t\n"
- "vfmadd231ps ymm9,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+40]\t\n"
- "vfmadd231ps ymm10,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+44]\t\n"
- "vfmadd231ps ymm11,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+48]\t\n"
- "vfmadd231ps ymm12,ymm15,ymm14\t\n"
- "vbroadcastss ymm14,DWORD PTR [r9+52]\t\n"
- "vfmadd231ps ymm13,ymm15,ymm14\t\n"
- "add r9,56\t\n"
- "add r11, 16\t\n"
- "inc r14\t\n"
- "cmp r14, r8\t\n"
- "jl loop_inner%=\t\n"
- "add r10, rsi\t\n"
-
- "cmp rdx, 1\t\n"
- "je L_accum%=\t\n"
- // Dump C
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n"
- "add r12, r13\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm13\t\n"
- "add r12, r13\t\n"
- "jmp L_done%=\t\n"
-
- "L_accum%=:\t\n"
- // Dump C with accumulate
- "vbroadcastss ymm15,DWORD PTR [r15]\t\n"
- "vfmadd231ps ymm0,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm0\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm1,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm1\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm2,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm2\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm3,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm3\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm4,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm4\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm5,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm5\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm6,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm6\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm7,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm7\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm8,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm8\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm9,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm9\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm10,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm10\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm11\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm12,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm12\t\n"
- "add r12, r13\t\n"
- "vfmadd231ps ymm13,ymm15,YMMWORD PTR [r12 + 0]\t\n"
- "vmovups YMMWORD PTR [r12 + 0], ymm13\t\n"
+ "vfmadd231ps ymm11,ymm15,YMMWORD PTR [r12 + 32]\t\n"
+ "vmovups YMMWORD PTR [r12 + 32], ymm11\t\n"
"add r12, r13\t\n"
"L_done%=:\t\n"
// next outer iteration
- "add rcx, 32\t\n"
+ "add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
diff --git a/src/FbgemmFP16UKernelsAvx2.h b/src/FbgemmFP16UKernelsAvx2.h
index 4053332..6e7dfbc 100644
--- a/src/FbgemmFP16UKernelsAvx2.h
+++ b/src/FbgemmFP16UKernelsAvx2.h
@@ -24,20 +24,12 @@ struct GemmParams {
uint64_t b_block_cols;
uint64_t b_block_size;
};
-void __attribute__((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_1x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_2x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_3x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_4x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_5x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_6x2_AVX2_fA0fB0fC0(GemmParams* gp);
typedef void (*funcptr_fp16)(GemmParams* gp);
;
diff --git a/src/codegen_fp16fp32.cc b/src/codegen_fp16fp32.cc
index 17bb113..7c8e10c 100644
--- a/src/codegen_fp16fp32.cc
+++ b/src/codegen_fp16fp32.cc
@@ -47,20 +47,35 @@ int main() {
{2,
"AVX2",
{
- {1, 1, 0},
- {2, 1, 0},
- {3, 1, 0},
- {4, 1, 0},
- {5, 1, 0},
- {6, 1, 0},
- {7, 1, 0},
- {8, 1, 0},
- {9, 1, 0},
- {10, 1, 0},
- {11, 1, 0},
- {12, 1, 0},
- {13, 1, 0},
- {14, 1, 0},
+ // 4x3 register layout
+ // {1, 3, 0},
+ // {2, 3, 0},
+ // {3, 3, 0},
+ // {4, 3, 0},
+
+ // 6x2 register layout
+ {1, 2, 0},
+ {2, 2, 0},
+ {3, 2, 0},
+ {4, 2, 0},
+ {5, 2, 0},
+ {6, 2, 0},
+
+ // 14x1 register layout
+ // {1, 1, 0},
+ // {2, 1, 0},
+ // {3, 1, 0},
+ // {4, 1, 0},
+ // {5, 1, 0},
+ // {6, 1, 0},
+ // {7, 1, 0},
+ // {8, 1, 0},
+ // {9, 1, 0},
+ // {10, 1, 0},
+ // {11, 1, 0},
+ // {12, 1, 0},
+ // {13, 1, 0},
+ // {14, 1, 0},
}}};
// open all files
@@ -159,7 +174,6 @@ int main() {
string vAtmp = "ymm" + to_string(last_free_ymmreg++);
// produce register block of B col
- assert(ukernel_shape[k][1] == 1);
vector<string> vBcol(ukernel_shape[k][1]);
for (auto c = 0; c < ukernel_shape[k][1]; c++) {
@@ -228,82 +242,50 @@ int main() {
srcfile << "\n";
- if (ukernel_shape[k][0] <= 13) {
- addi(srcfile, "vcvtph2ps ymm15, XMMWORD PTR [r10 + 0]");
- addi(srcfile, "mov r11, 16");
- } else {
- addi(srcfile, "mov r11, 0");
- }
-
srcfile << "\n";
string label = "loop_inner%=";
addi(srcfile, label + ":");
srcfile << "\n";
- if (ukernel_shape[k][0] <= 13) {
- auto a_offset = 0, unroll_factor = 2;
- for (auto u = 0; u < unroll_factor; u++) {
- string breg = (u == 0) ? "ymm14" : "ymm15";
- string breg_rev = (u == 0) ? "ymm15" : "ymm14";
-
- addi(
- srcfile,
- "vcvtph2ps " + breg + ",XMMWORD PTR [r10 + r11 + " +
- to_string(u * 16) + "]");
- addi(srcfile, "inc r14");
- for (auto r = 0; r < vCtile.size(); r++) {
- addi(
- srcfile,
- "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" +
- to_string(a_offset) + "]");
- addi(
- srcfile,
- "vfmadd231ps " + vCtile[r][0] + "," + breg_rev + "," +
- vAtmp);
- if (u == 1 && r == vCtile.size() / 2)
- addi(srcfile, "add r11, 32");
- a_offset += 4;
- }
- if (u < unroll_factor - 1) {
- addi(srcfile, "cmp r14, r8");
- addi(srcfile, "jge " + exitlabel);
- }
- }
-
- addi(srcfile, "add r9," + to_string(a_offset));
- addi(srcfile, "cmp r14, r8");
- addi(srcfile, "jl " + label);
-
- srcfile << "\n";
+ for (int c = 0; c < vCtile[0].size(); c++) {
+ addi(
+ srcfile,
+ "vcvtph2ps " + vBcol[c] + ",XMMWORD PTR [r10 + " +
+ to_string(16 * c) + "]");
+ }
- addi(srcfile, exitlabel + ":");
- } else {
+ for (int r = 0; r < vCtile.size(); r++) {
addi(
srcfile,
- "vcvtph2ps " + vBcol[0] + ",XMMWORD PTR [r10 + r11]");
- for (auto r = 0; r < vCtile.size(); r++) {
+ "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" +
+ to_string(4 * r) + "]");
+ for (int c = 0; c < vCtile[0].size(); c++) {
addi(
srcfile,
- "vbroadcastss " + vAtmp + ",DWORD PTR [r9+" +
- to_string(4 * r) + "]");
- addi(
- srcfile,
- "vfmadd231ps " + vCtile[r][0] + "," + vBcol[0] + "," +
+ "vfmadd231ps " + vCtile[r][c] + "," + vBcol[c] + "," +
vAtmp);
}
+ }
- addi(
- srcfile,
- "add r9," + to_string(4 * ukernel_shape[k][0]),
- fixedA); // move A ptr
- addi(srcfile, "add r11, 16");
+ addi(
+ srcfile,
+ "add r9," + to_string(4 * ukernel_shape[k][0]),
+ fixedA); // move A ptr
- addi(srcfile, "inc r14");
- addi(srcfile, "cmp r14, r8");
- addi(srcfile, "jl " + label);
- }
+ addi(
+ srcfile,
+ "add r10," + to_string(16 * ukernel_shape[k][1]),
+ fixedA); // move A ptr
+
+ addi(srcfile, "inc r14");
+ addi(srcfile, "cmp r14, r8");
+ addi(srcfile, "jl " + label);
+
+ srcfile << "\n";
+
+ addi(srcfile, exitlabel + ":");
- addi(srcfile, "add r10, rsi");
+ // addi(srcfile, "add r10, rsi");
srcfile << "\n";
// end marker