diff options
author | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2020-01-13 20:38:42 +0300 |
---|---|---|
committer | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2020-01-14 22:13:49 +0300 |
commit | 0b7e38fd087834a50618fdfcfbc3f1c98d644ad7 (patch) | |
tree | b5800a6dbe1c49ea9d45f9b0c242bdb17d9e64ab | |
parent | 5960e85f44984896ae78e8742f35f289bf43fe1f (diff) |
Add 8 bit rearrangement for SSSE3 and AVX2 (branch: rearrangement-b)
-rw-r--r-- | avx2_gemm.h | 2 | ||||
-rw-r--r-- | interleave.h | 74 | ||||
-rw-r--r-- | intrinsics.h | 10 | ||||
-rw-r--r-- | ssse3_gemm.h | 2 | ||||
-rw-r--r-- | test/rearrangement_b_test.cc | 4 |
5 files changed, 90 insertions, 2 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h index 7d240b9..88e74e9 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -210,6 +210,8 @@ struct AVX2_8bit { static const Index kBTileRow = 32; static const Index kBTileCol = 8; + INTGEMM_REARRANGEMENT_B_8(INTGEMM_AVX2, CPUType::AVX2) + INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8) INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { diff --git a/interleave.h b/interleave.h index 5f9d525..4b3179b 100644 --- a/interleave.h +++ b/interleave.h @@ -28,6 +28,13 @@ INTGEMM_INTERLEAVE_N(target, type, 64) INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i) INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i) + +INTGEMM_AVX2 static inline void Interleave128(__m256i& first, __m256i& second) { + auto temp = _mm256_permute2f128_si256(first, second, 0x20); + second = _mm256_permute2f128_si256(first, second, 0x31); + first = temp; +} + #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i) #endif @@ -144,6 +151,73 @@ template <class Register> static inline void Transpose8InLane( r11 = tmp; } +static inline void Interleave(Index bytes, __m128i& first, __m128i& second) { + switch (bytes) { + case 1: Interleave8(first, second); break; + case 2: Interleave16(first, second); break; + case 4: Interleave32(first, second); break; + case 8: Interleave64(first, second); break; + default: break; + } +} + +static inline void Interleave(Index bytes, __m256i& first, __m256i& second) { + switch (bytes) { + case 1: Interleave8(first, second); break; + case 2: Interleave16(first, second); break; + case 4: Interleave32(first, second); break; + case 8: Interleave64(first, second); break; + case 16: Interleave128(first, second); break; + default: break; + } +} + +template <typename Integer, typename Register> +static inline void Transpose(Register* ptr, Index step) { + for (Index bytes = sizeof(Integer); bytes < sizeof(Register); bytes *= 2) + for (Index i = 0; 
i < sizeof(Register) / bytes; ++i) + for (Index j = 0; j < bytes; ++j) + Interleave(bytes, ptr[(2 * bytes * i + j) * step], ptr[(2 * bytes * i + j + bytes) * step]); + + for (Index i = 0; i < sizeof(Register) / sizeof(__m128i); ++i, ptr += sizeof(__m128i)) + { + Swap(ptr[1 * step], ptr[8 * step]); + Swap(ptr[3 * step], ptr[10 * step]); + Swap(ptr[5 * step], ptr[12 * step]); + Swap(ptr[7 * step], ptr[14 * step]); + Swap(ptr[2 * step], ptr[4 * step]); + Swap(ptr[3 * step], ptr[5 * step]); + Swap(ptr[10 * step], ptr[12 * step]); + Swap(ptr[11 * step], ptr[13 * step]); + } +} + +#define INTGEMM_REARRANGEMENT_B_8(target, cpu_type) \ +target static inline void RearrangementB(const int8_t* input, int8_t* output, Index rows, Index cols) { \ + using Register = vector_t<cpu_type, int8_t>; \ + static Register temp[sizeof(Register)]; \ + const Index kColStride = 8; \ + \ + assert(cols % sizeof(Register) == 0); \ + assert(rows % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ + \ + Register* output_it = reinterpret_cast<Register*>(output); \ + for (Index c = 0; c < cols; c += sizeof(Register), output_it += rows) { \ + for (Index r = 0; r < rows; r += sizeof(Register)) { \ + for (Index i = 0; i < sizeof(Register); ++i) \ + temp[i] = loadu_si(reinterpret_cast<const Register*>(input + cols * (r + i) + c)); \ + \ + Transpose<int8_t>(temp, 1); \ + \ + for (Index i = 0; i < sizeof(Register) / kColStride; ++i) \ + for (Index j = 0; j < kColStride; ++j) \ + output_it[(r + i * rows) * kColStride / sizeof(Register) + j] = temp[i * kColStride + j]; \ + } \ + } \ +} + // PREPARE B: quantize and rearrange. B is presumed to be constantparameters // so we can take our time rearranging it in order to save during the multiply. 
// diff --git a/intrinsics.h b/intrinsics.h index 5fe3159..fa852f5 100644 --- a/intrinsics.h +++ b/intrinsics.h @@ -18,6 +18,7 @@ namespace intgemm { */ template <class Register> static inline Register load_ps(float const* from); template <class Register> static inline Register loadu_ps(const float* mem_addr); +template <class Register> static inline Register loadu_si(const Register* mem_addr); template <class Register> static inline Register set1_epi16(int16_t to); template <class Register> static inline Register set1_epi32(int32_t to); template <class Register> static inline Register set1_epi8(int8_t to); @@ -80,6 +81,9 @@ template <> INTGEMM_SSE2 inline __m128 load_ps<__m128>(const float* from) { template <> INTGEMM_SSE2 inline __m128 loadu_ps(const float* mem_addr) { return _mm_loadu_ps(mem_addr); } +template <> INTGEMM_SSE2 inline __m128i loadu_si(const __m128i* mem_addr) { + return _mm_loadu_si128(mem_addr); +} INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) { return _mm_madd_epi16(first, second); } @@ -252,6 +256,9 @@ INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i v template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) { return _mm256_loadu_ps(mem_addr); } +template <> INTGEMM_AVX2 inline __m256i loadu_si(const __m256i* mem_addr) { + return _mm256_loadu_si256(mem_addr); +} template <> INTGEMM_AVX2 inline __m256 load_ps<__m256>(const float* from) { return _mm256_load_ps(from); } @@ -429,6 +436,9 @@ INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m51 template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) { return _mm512_loadu_ps(mem_addr); } +template <> INTGEMM_AVX512BW inline __m512i loadu_si(const __m512i* mem_addr) { + return _mm512_loadu_si512(mem_addr); +} INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) { return _mm512_madd_epi16(first, second); } diff --git a/ssse3_gemm.h b/ssse3_gemm.h index 
a2d74dd..0728ef3 100644 --- a/ssse3_gemm.h +++ b/ssse3_gemm.h @@ -135,6 +135,8 @@ struct SSSE3_8bit { static const Index kBTileRow = 16; static const Index kBTileCol = 8; + INTGEMM_REARRANGEMENT_B_8(INTGEMM_SSSE3, CPUType::SSE2) + INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { diff --git a/test/rearrangement_b_test.cc b/test/rearrangement_b_test.cc index b948f61..e23b2ef 100644 --- a/test/rearrangement_b_test.cc +++ b/test/rearrangement_b_test.cc @@ -71,14 +71,14 @@ TEST_CASE("RearrangementB SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - // CHECK(TestMany<SSSE3_8bit>()); + CHECK(TestMany<SSSE3_8bit>()); } TEST_CASE("RearrangementB AVX2", "") { if (kCPU < CPUType::AVX2) return; - // CHECK(TestMany<AVX2_8bit>()); + CHECK(TestMany<AVX2_8bit>()); // CHECK(TestMany<AVX2_16bit>()); } |