diff options
author | Kenneth Heafield <github@kheafield.com> | 2020-04-02 02:09:21 +0300 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2020-04-02 02:09:21 +0300 |
commit | 257915109fc60d6c5e672477809264917fdf6c63 (patch) | |
tree | 5d17cc10239ff63736f9097899a49362171c4a13 | |
parent | 58a75a9f540c63adfffc8caf20ed5804b7c74d3d (diff) |
Reduction within 128-bit lanes
-rw-r--r-- | test/tile_test.cc | 5 | ||||
-rw-r--r-- | test/tile_test.inl | 39 | ||||
-rw-r--r-- | tile/reduce.h | 12 | ||||
-rw-r--r-- | tile/reduce.inl | 19 |
4 files changed, 73 insertions, 2 deletions
diff --git a/test/tile_test.cc b/test/tile_test.cc index 06dbe96..2eb878a 100644 --- a/test/tile_test.cc +++ b/test/tile_test.cc @@ -1,9 +1,10 @@ +#include "../aligned.h" #include "../tile/access.h" #include "../tile/dot.h" #include "../tile/reduce.h" - #include "test.h" -#include "../aligned.h" + +#include <random> #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI #define INTGEMM_THIS_IS_AVX512VNNI diff --git a/test/tile_test.inl b/test/tile_test.inl index c2e4257..98a1300 100644 --- a/test/tile_test.inl +++ b/test/tile_test.inl @@ -57,6 +57,45 @@ TEST_CASE("Basic Tile " INTGEMM_TEST_NAME, "[tile]") { } } +INTGEMM_TARGET void DumpRegister(Register reg) { + int32_t values[sizeof(Register) / sizeof(int32_t)]; + memcpy(values, &reg, sizeof(Register)); + for (std::size_t i = 0; i < sizeof(Register) / sizeof(int32_t); ++i) { + std::cout.width(11); + std::cout << values[i] << ' '; + } +} + +INTGEMM_TARGET void Pack32Test() { + const std::size_t kPack = sizeof(Register) / sizeof(int32_t); + Register regs[kPack]; + std::mt19937 gen; + //std::uniform_int_distribution<int32_t> dist(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max()); + std::uniform_int_distribution<int32_t> dist(0, 100); + std::vector<int32_t> reference(kPack, 0); + for (std::size_t i = 0; i < kPack; ++i) { + int32_t temp[kPack]; + for (std::size_t j = 0; j < kPack; ++j) { + temp[j] = dist(gen); + reference[j] += temp[j]; + } + memcpy(&regs[i], temp, sizeof(Register)); + } + Register *indirect = regs; + for (std::size_t i = 0; i < 4; ++i) { + DumpRegister(indirect[i]); + std::cout << '\n'; + } + Pack32<3, Sum32Op>(indirect); + DumpRegister(indirect[0]); + std::cout << '\n'; +} + +TEST_CASE("Reduce " INTGEMM_TEST_NAME, "[tile]") { + if (kCPU >= CPUType::INTGEMM_ARCH) + Pack32Test(); +} + } // namespace INTGEMM_ARCH } // namespace intgemm diff --git a/tile/reduce.h b/tile/reduce.h index ecfcecc..0ab4e0d 100644 --- a/tile/reduce.h +++ b/tile/reduce.h @@ -37,3 +37,15 @@ struct Sum32Op { #define 
INTGEMM_THIS_IS_SSE2 #include "reduce.inl" #undef INTGEMM_THIS_IS_SSE2 + +namespace intgemm { + +namespace SSSE3 { +using SSE2::Pack32; +} // namespace SSSE3 + +namespace AVX512VNNI { +using AVX512BW::Pack32; +} // namespace AVX512VNNI + +} // namespace intgemm diff --git a/tile/reduce.inl b/tile/reduce.inl index f902f04..3bfa899 100644 --- a/tile/reduce.inl +++ b/tile/reduce.inl @@ -14,6 +14,24 @@ namespace intgemm { namespace INTGEMM_ARCH { +template <class Op> struct Pack64Even { + template <class Iterator> INTGEMM_TARGET static inline void body(Register *regs) { + const Index i = Iterator::template I<0>(); + Register hi = unpackhi_epi64(regs[2 * i], regs[2 * i + 1]); + Register lo = unpacklo_epi64(regs[2 * i], regs[2 * i + 1]); + regs[i] = Op::Run(hi, lo); + } +}; +template <Index Valid, class Op> INTGEMM_TARGET static inline void Pack64(Register *regs) { + StaticLoop<Pack64Even<Op>, MakeStaticLoopIterator<Valid / 2>>(regs); + if (Valid & 1) { + // For the odd case, shuffle to form 0 g where g is garbage and 0 is accumulated. + Register shuffled = shuffle_epi32(regs[Valid - 1], 0xB0 /* CDAA */); + regs[Valid / 2] = Op::Run(shuffled, regs[Valid - 1]); + } + // Now [0, (Valid + 1) / 2) contains registers to pack with 128-bit interleaving. +} + template <class Op> struct Pack32Even { template <class Iterator> INTGEMM_TARGET static inline void body(Register *regs) { const Index i = Iterator::template I<0>(); @@ -31,6 +49,7 @@ template <Index Valid, class Op> INTGEMM_TARGET static inline void Pack32(Regist regs[Valid / 2] = Op::Run(shuffled, regs[Valid - 1]); } // Now [0, (Valid + 1) / 2) contains registers to pack with 64-bit interleaving. + Pack64<(Valid + 1) / 2, Op>(regs); } } // namespace INTGEMM_ARCH |