diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2019-03-08 05:02:55 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-03-08 05:05:39 +0300 |
commit | 844dacc267391cd2a725d81c2495636f0765771b (patch) | |
tree | cb20d367ce086dfb4374a7794fa8d889ffead09b /src/FbgemmFP16UKernelsAvx2.h | |
parent | 66b41357561f2ff9895d2b4638273f07c49dbe29 (diff) |
Fixes for FBGEMM FP16 performance (#82)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/82
This is a quick fix for matching FBGEMM FP16 performance with SKINNY GEMM FP16.
Basically, this Diff switches the register layout of the C accumulation buffer inside the micro-kernel from MR * 1 to MR * 2. See T40816746 for the reasons.
Reviewed By: zhengwy888
Differential Revision: D14278430
fbshipit-source-id: 961dd681deee69e2b7fec6bcdba7920e0b09134a
Diffstat (limited to 'src/FbgemmFP16UKernelsAvx2.h')
-rw-r--r-- | src/FbgemmFP16UKernelsAvx2.h | 20 |
1 file changed, 6 insertions, 14 deletions
diff --git a/src/FbgemmFP16UKernelsAvx2.h b/src/FbgemmFP16UKernelsAvx2.h
index 4053332..6e7dfbc 100644
--- a/src/FbgemmFP16UKernelsAvx2.h
+++ b/src/FbgemmFP16UKernelsAvx2.h
@@ -24,20 +24,12 @@ struct GemmParams {
   uint64_t b_block_cols;
   uint64_t b_block_size;
 };
-void __attribute__((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_2x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_3x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_4x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_5x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_6x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_7x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_8x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_9x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_10x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_11x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_12x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_13x1_AVX2_fA0fB0fC0(GemmParams* gp);
-void __attribute__((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_1x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_2x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_3x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_4x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_5x2_AVX2_fA0fB0fC0(GemmParams* gp);
+void __attribute__((noinline)) gemmkernel_6x2_AVX2_fA0fB0fC0(GemmParams* gp);
 typedef void (*funcptr_fp16)(GemmParams* gp);
 ;
|