Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMateusz Chudyk <mateuszchudyk@gmail.com>2020-01-13 20:38:42 +0300
committerMateusz Chudyk <mateuszchudyk@gmail.com>2020-01-14 22:13:49 +0300
commit0b7e38fd087834a50618fdfcfbc3f1c98d644ad7 (patch)
treeb5800a6dbe1c49ea9d45f9b0c242bdb17d9e64ab
parent5960e85f44984896ae78e8742f35f289bf43fe1f (diff)
Add 8 bit rearrangement for SSSE3 and AVX2 (branch: rearrangement-b)
-rw-r--r--avx2_gemm.h2
-rw-r--r--interleave.h74
-rw-r--r--intrinsics.h10
-rw-r--r--ssse3_gemm.h2
-rw-r--r--test/rearrangement_b_test.cc4
5 files changed, 90 insertions, 2 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 7d240b9..88e74e9 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -210,6 +210,8 @@ struct AVX2_8bit {
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
+ INTGEMM_REARRANGEMENT_B_8(INTGEMM_AVX2, CPUType::AVX2)
+
INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
diff --git a/interleave.h b/interleave.h
index 5f9d525..4b3179b 100644
--- a/interleave.h
+++ b/interleave.h
@@ -28,6 +28,13 @@ INTGEMM_INTERLEAVE_N(target, type, 64)
INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i)
INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i)
+
+INTGEMM_AVX2 static inline void Interleave128(__m256i& first, __m256i& second) {
+ auto temp = _mm256_permute2f128_si256(first, second, 0x20);
+ second = _mm256_permute2f128_si256(first, second, 0x31);
+ first = temp;
+}
+
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512
INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i)
#endif
@@ -144,6 +151,73 @@ template <class Register> static inline void Transpose8InLane(
r11 = tmp;
}
+static inline void Interleave(Index bytes, __m128i& first, __m128i& second) {
+ switch (bytes) {
+ case 1: Interleave8(first, second); break;
+ case 2: Interleave16(first, second); break;
+ case 4: Interleave32(first, second); break;
+ case 8: Interleave64(first, second); break;
+ default: break;
+ }
+}
+
+static inline void Interleave(Index bytes, __m256i& first, __m256i& second) {
+ switch (bytes) {
+ case 1: Interleave8(first, second); break;
+ case 2: Interleave16(first, second); break;
+ case 4: Interleave32(first, second); break;
+ case 8: Interleave64(first, second); break;
+ case 16: Interleave128(first, second); break;
+ default: break;
+ }
+}
+
+template <typename Integer, typename Register>
+static inline void Transpose(Register* ptr, Index step) {
+ for (Index bytes = sizeof(Integer); bytes < sizeof(Register); bytes *= 2)
+ for (Index i = 0; i < sizeof(Register) / bytes; ++i)
+ for (Index j = 0; j < bytes; ++j)
+ Interleave(bytes, ptr[(2 * bytes * i + j) * step], ptr[(2 * bytes * i + j + bytes) * step]);
+
+ for (Index i = 0; i < sizeof(Register) / sizeof(__m128i); ++i, ptr += sizeof(__m128i))
+ {
+ Swap(ptr[1 * step], ptr[8 * step]);
+ Swap(ptr[3 * step], ptr[10 * step]);
+ Swap(ptr[5 * step], ptr[12 * step]);
+ Swap(ptr[7 * step], ptr[14 * step]);
+ Swap(ptr[2 * step], ptr[4 * step]);
+ Swap(ptr[3 * step], ptr[5 * step]);
+ Swap(ptr[10 * step], ptr[12 * step]);
+ Swap(ptr[11 * step], ptr[13 * step]);
+ }
+}
+
+#define INTGEMM_REARRANGEMENT_B_8(target, cpu_type) \
+target static inline void RearrangementB(const int8_t* input, int8_t* output, Index rows, Index cols) { \
+ using Register = vector_t<cpu_type, int8_t>; \
+ static Register temp[sizeof(Register)]; \
+ const Index kColStride = 8; \
+ \
+ assert(cols % sizeof(Register) == 0); \
+ assert(rows % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
+ \
+ Register* output_it = reinterpret_cast<Register*>(output); \
+ for (Index c = 0; c < cols; c += sizeof(Register), output_it += rows) { \
+ for (Index r = 0; r < rows; r += sizeof(Register)) { \
+ for (Index i = 0; i < sizeof(Register); ++i) \
+ temp[i] = loadu_si(reinterpret_cast<const Register*>(input + cols * (r + i) + c)); \
+ \
+ Transpose<int8_t>(temp, 1); \
+ \
+ for (Index i = 0; i < sizeof(Register) / kColStride; ++i) \
+ for (Index j = 0; j < kColStride; ++j) \
+ output_it[(r + i * rows) * kColStride / sizeof(Register) + j] = temp[i * kColStride + j]; \
+ } \
+ } \
+}
+
// PREPARE B: quantize and rearrange. B is presumed to be constantparameters
// so we can take our time rearranging it in order to save during the multiply.
//
diff --git a/intrinsics.h b/intrinsics.h
index 5fe3159..fa852f5 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -18,6 +18,7 @@ namespace intgemm {
*/
template <class Register> static inline Register load_ps(float const* from);
template <class Register> static inline Register loadu_ps(const float* mem_addr);
+template <class Register> static inline Register loadu_si(const Register* mem_addr);
template <class Register> static inline Register set1_epi16(int16_t to);
template <class Register> static inline Register set1_epi32(int32_t to);
template <class Register> static inline Register set1_epi8(int8_t to);
@@ -80,6 +81,9 @@ template <> INTGEMM_SSE2 inline __m128 load_ps<__m128>(const float* from) {
template <> INTGEMM_SSE2 inline __m128 loadu_ps(const float* mem_addr) {
return _mm_loadu_ps(mem_addr);
}
+template <> INTGEMM_SSE2 inline __m128i loadu_si(const __m128i* mem_addr) {
+ return _mm_loadu_si128(mem_addr);
+}
INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
return _mm_madd_epi16(first, second);
}
@@ -252,6 +256,9 @@ INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i v
template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) {
return _mm256_loadu_ps(mem_addr);
}
+template <> INTGEMM_AVX2 inline __m256i loadu_si(const __m256i* mem_addr) {
+ return _mm256_loadu_si256(mem_addr);
+}
template <> INTGEMM_AVX2 inline __m256 load_ps<__m256>(const float* from) {
return _mm256_load_ps(from);
}
@@ -429,6 +436,9 @@ INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m51
template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) {
return _mm512_loadu_ps(mem_addr);
}
+template <> INTGEMM_AVX512BW inline __m512i loadu_si(const __m512i* mem_addr) {
+ return _mm512_loadu_si512(mem_addr);
+}
INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
return _mm512_madd_epi16(first, second);
}
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index a2d74dd..0728ef3 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -135,6 +135,8 @@ struct SSSE3_8bit {
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
+ INTGEMM_REARRANGEMENT_B_8(INTGEMM_SSSE3, CPUType::SSE2)
+
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
diff --git a/test/rearrangement_b_test.cc b/test/rearrangement_b_test.cc
index b948f61..e23b2ef 100644
--- a/test/rearrangement_b_test.cc
+++ b/test/rearrangement_b_test.cc
@@ -71,14 +71,14 @@ TEST_CASE("RearrangementB SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- // CHECK(TestMany<SSSE3_8bit>());
+ CHECK(TestMany<SSSE3_8bit>());
}
TEST_CASE("RearrangementB AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- // CHECK(TestMany<AVX2_8bit>());
+ CHECK(TestMany<AVX2_8bit>());
// CHECK(TestMany<AVX2_16bit>());
}