Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMateusz Chudyk <mateuszchudyk@gmail.com>2020-04-25 12:27:29 +0300
committerMateusz Chudyk <mateuszchudyk@gmail.com>2020-04-25 12:36:35 +0300
commit578f6655099903b18173c1f301a2f4e55ad1b16d (patch)
tree31d9b5ce5c8a5561322c0b2eed793b4a6ccf4cad
parent6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff)
Add SelectColumnsB for column major orderstatic-selectcolumnsb-colmajor
-rw-r--r--avx2_gemm.h9
-rw-r--r--avx512_gemm.h7
-rw-r--r--interleave.h13
-rw-r--r--sse2_gemm.h5
-rw-r--r--ssse3_gemm.h4
5 files changed, 38 insertions, 0 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 68eb37e..0403bca 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -18,6 +18,7 @@ INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant
}
INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
@@ -90,6 +91,10 @@ struct AVX2_16bit {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
+ INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+ }
+
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX2";
@@ -236,6 +241,10 @@ struct AVX2_8bit {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
+ INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+ }
+
INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2)
diff --git a/avx512_gemm.h b/avx512_gemm.h
index a0087b3..efa2c13 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -45,6 +45,7 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 q
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX512BW, __m512i)
// For PrepareB we want to read 8 columns at a time. When converting 32-bit
// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
@@ -208,6 +209,9 @@ struct AVX512_16bit {
INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
+ INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
+ }
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
@@ -316,6 +320,9 @@ struct AVX512_8bit {
INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
}
+ INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
+ }
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
diff --git a/interleave.h b/interleave.h
index 231be46..cd5002e 100644
--- a/interleave.h
+++ b/interleave.h
@@ -314,4 +314,17 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp
} \
}
+#define INTGEMM_SELECT_COL_B_COLUMN_MAJOR(target, Register) \
+target static inline void SelectColumnsOfB_ColumnMajor(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
+ assert(rows_bytes % sizeof(Register) == 0); \
+ assert((cols_end - cols_begin) % 8 == 0); \
+ /* Do columns for multiples of 8.*/ \
+ int register_rows = rows_bytes / sizeof(Register); \
+ for (; cols_begin != cols_end; ++cols_begin) { \
+ const Register *it = input + (*cols_begin & 7) * register_rows; \
+ for (int r = 0; r < register_rows; ++r) \
+ *output++ = *it++; \
+ } \
+}
+
} // namespace intgemm
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 8b8f1c2..f258b62 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -18,6 +18,7 @@ INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant
}
INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
@@ -88,6 +89,10 @@ struct SSE2_16bit {
//TODO #DEFINE
sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
+ INTGEMM_SSE2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ //TODO #DEFINE
+ sse2::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+ }
INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2)
constexpr static const char *const kName = "16-bit SSE2";
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index fd3ab8c..5841541 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -20,6 +20,7 @@ INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quan
}
INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
@@ -149,6 +150,9 @@ struct SSSE3_8bit {
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
+ INTGEMM_SSSE3 static void SelectColumnsB_ColumnsMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ ssse3::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+ }
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)