Add SelectColumnsB for column major orderstatic-selectcolumnsb-colmajor

author: Mateusz Chudyk <mateuszchudyk@gmail.com> 2020-04-25 12:27:29 +0300
committer: Mateusz Chudyk <mateuszchudyk@gmail.com> 2020-04-25 12:36:35 +0300
commit: 578f6655099903b18173c1f301a2f4e55ad1b16d (patch)
tree: 31d9b5ce5c8a5561322c0b2eed793b4a6ccf4cad
parent: 6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff)
5 files changed, 38 insertions, 0 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 68eb37e..0403bca 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -18,6 +18,7 @@ INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant
 }
 
 INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
   public:
@@ -90,6 +91,10 @@ struct AVX2_16bit {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
   }
 
+  INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+  }
+
   INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
 
   constexpr static const char *const kName = "16-bit AVX2";
@@ -236,6 +241,10 @@ struct AVX2_8bit {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
   }
 
+  INTGEMM_AVX2 static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    avx2::SelectColumnsOfB_ColumnMajor((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+  }
+
   INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
 
   INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2)
diff --git a/avx512_gemm.h b/avx512_gemm.h
index a0087b3..efa2c13 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -45,6 +45,7 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 q
 
 /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
 INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_AVX512BW, __m512i)
 
 // For PrepareB we want to read 8 columns at a time.  When converting 32-bit
 // floats to 8-bit values, that's 32 bytes of floats.  But AVX512 is 64 bytes
@@ -208,6 +209,9 @@ struct AVX512_16bit {
   INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
   }
+  INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
+  }
 
   /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
   INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
@@ -316,6 +320,9 @@ struct AVX512_8bit {
   INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
   }
+  INTGEMM_AVX512BW static void SelectColumnsB_ColumnMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    avx512f::SelectColumnsOfB_ColumnMajor((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
+  }
 
   // Special AVX512 implementation due to having 32 registers (so I don't have to
   // allocate registers manually) and no sign instruction.
diff --git a/interleave.h b/interleave.h
index 231be46..cd5002e 100644
--- a/interleave.h
+++ b/interleave.h
@@ -314,4 +314,17 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp
   } \
 }
 
+#define INTGEMM_SELECT_COL_B_COLUMN_MAJOR(target, Register) \
+target static inline void SelectColumnsOfB_ColumnMajor(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
+  assert(rows_bytes % sizeof(Register) == 0); \
+  assert((cols_end - cols_begin) % 8 == 0);  \
+  /* Do columns for multiples of 8.*/ \
+  int register_rows = rows_bytes / sizeof(Register); \
+  for (; cols_begin != cols_end; ++cols_begin) { \
+    const Register *it = input + (*cols_begin & 7) * register_rows; \
+    for (int r = 0; r < register_rows; ++r) \
+      *output++ = *it++; \
+  } \
+}
+
 } // namespace intgemm
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 8b8f1c2..f258b62 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -18,6 +18,7 @@ INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant
 }
 
 INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSE2, __m128i)
 
 class QuantizeTile16 {
   public:
@@ -88,6 +89,10 @@ struct SSE2_16bit {
     //TODO #DEFINE
     sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
   }
+   INTGEMM_SSE2 static void SelectColumnsB_ColumnMajor(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    //TODO #DEFINE
+    sse2::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+  }
   INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2)
 
   constexpr static const char *const kName = "16-bit SSE2";
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index fd3ab8c..5841541 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -20,6 +20,7 @@ INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quan
 }
 
 INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
+INTGEMM_SELECT_COL_B_COLUMN_MAJOR(INTGEMM_SSSE3, __m128i)
 
 class QuantizeTile8 {
   public:
@@ -149,6 +150,9 @@ struct SSSE3_8bit {
   INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
   }
+  INTGEMM_SSSE3 static void SelectColumnsB_ColumnsMajor(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    ssse3::SelectColumnsOfB_ColumnMajor((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+  }
 
   INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
author	Mateusz Chudyk <mateuszchudyk@gmail.com>	2020-04-25 12:27:29 +0300
committer	Mateusz Chudyk <mateuszchudyk@gmail.com>	2020-04-25 12:36:35 +0300
commit	578f6655099903b18173c1f301a2f4e55ad1b16d (patch)
tree	31d9b5ce5c8a5561322c0b2eed793b4a6ccf4cad
parent	6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff)