diff options
author | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2020-04-25 12:27:29 +0300 |
---|---|---|
committer | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2020-04-25 12:36:35 +0300 |
commit | 578f6655099903b18173c1f301a2f4e55ad1b16d (patch) | |
tree | 31d9b5ce5c8a5561322c0b2eed793b4a6ccf4cad | |
parent | 6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff) |
Add SelectColumnsB for column major orderstatic-selectcolumnsb-colmajor
-rw-r--r-- | avx2_gemm.h | 9 | ||||
-rw-r--r-- | avx512_gemm.h | 7 | ||||
-rw-r--r-- | interleave.h | 13 | ||||
-rw-r--r-- | sse2_gemm.h | 5 | ||||
-rw-r--r-- | ssse3_gemm.h | 4 |
5 files changed, 38 insertions, 0 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h index 68eb37e..0403bca 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -18,6 +18,7 @@ INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant } INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i) +INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX2, __m256i) class QuantizeTile16 { public: @@ -90,6 +91,10 @@ struct AVX2_16bit { avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); } + INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); + } + INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2) constexpr static const char *const kName = "16-bit AVX2"; @@ -236,6 +241,10 @@ struct AVX2_8bit { avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); } + INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); + } + INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2) INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2) diff --git a/avx512_gemm.h b/avx512_gemm.h index a0087b3..efa2c13 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -45,6 +45,7 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 q /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i) +INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX512BW, __m512i) // For PrepareB we want to read 8 columns at a time. When converting 32-bit // floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes @@ -208,6 +209,9 @@ struct AVX512_16bit { INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); } + INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); + } /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2) @@ -316,6 +320,9 @@ struct AVX512_8bit { INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); } + INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); + } // Special AVX512 implementation due to having 32 registers (so I don't have to // allocate registers manually) and no sign instruction. diff --git a/interleave.h b/interleave.h index 231be46..cd5002e 100644 --- a/interleave.h +++ b/interleave.h @@ -314,4 +314,17 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp } \ } +#define INTGEMM_SELECT_COL_B_COLUMN_MAJOR(target, Register) \ +target static inline void SelectColumnsOfB_ColumnMajor(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \ + assert(rows_bytes % sizeof(Register) == 0); \ + assert((cols_end - cols_begin) % 8 == 0); \ + /* Do columns for multiples of 8.*/ \ + int register_rows = rows_bytes / sizeof(Register); \ + for (; cols_begin != cols_end; ++cols_begin) { \ + const Register *it = input + (*cols_begin & 7) * register_rows; \ + for (int r = 0; r < register_rows; ++r) \ + *output++ = *it++; \ + } \ +} + } // namespace intgemm diff --git a/sse2_gemm.h b/sse2_gemm.h index 8b8f1c2..f258b62 100644 --- a/sse2_gemm.h +++ b/sse2_gemm.h @@ -18,6 +18,7 @@ INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant } INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i) +INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSE2, __m128i) class QuantizeTile16 { public: @@ -88,6 +89,10 @@ struct SSE2_16bit { //TODO #DEFINE sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); } + INTGEMM_SSE2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + //TODO #DEFINE + sse2::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); + } INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2) constexpr static const char *const kName = "16-bit SSE2"; diff --git a/ssse3_gemm.h b/ssse3_gemm.h index fd3ab8c..5841541 100644 --- a/ssse3_gemm.h +++ b/ssse3_gemm.h @@ -20,6 +20,7 @@ INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quan } INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i) +INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSSE3, __m128i) class QuantizeTile8 { public: @@ -149,6 +150,9 @@ struct SSSE3_8bit { INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); } + INTGEMM_SSSE3 static void SelectColumnsB_ColumnsMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + ssse3::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); + } INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2) |