Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2020-04-02 02:09:21 +0300
committerKenneth Heafield <github@kheafield.com>2020-04-02 02:09:21 +0300
commit257915109fc60d6c5e672477809264917fdf6c63 (patch)
tree5d17cc10239ff63736f9097899a49362171c4a13
parent58a75a9f540c63adfffc8caf20ed5804b7c74d3d (diff)
Reduction within 128-bit lanes
-rw-r--r--test/tile_test.cc5
-rw-r--r--test/tile_test.inl39
-rw-r--r--tile/reduce.h12
-rw-r--r--tile/reduce.inl19
4 files changed, 73 insertions, 2 deletions
diff --git a/test/tile_test.cc b/test/tile_test.cc
index 06dbe96..2eb878a 100644
--- a/test/tile_test.cc
+++ b/test/tile_test.cc
@@ -1,9 +1,10 @@
+#include "../aligned.h"
#include "../tile/access.h"
#include "../tile/dot.h"
#include "../tile/reduce.h"
-
#include "test.h"
-#include "../aligned.h"
+
+#include <random>
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
#define INTGEMM_THIS_IS_AVX512VNNI
diff --git a/test/tile_test.inl b/test/tile_test.inl
index c2e4257..98a1300 100644
--- a/test/tile_test.inl
+++ b/test/tile_test.inl
@@ -57,6 +57,45 @@ TEST_CASE("Basic Tile " INTGEMM_TEST_NAME, "[tile]") {
}
}
+// Debug helper: writes each 32-bit lane of reg to std::cout, right-aligned in
+// an 11-character column and followed by a space.  No newline is emitted.
+INTGEMM_TARGET void DumpRegister(Register reg) {
+  constexpr std::size_t kLanes = sizeof(Register) / sizeof(int32_t);
+  int32_t lanes[kLanes];
+  // Copy the register out bytewise so the lanes can be read as plain integers.
+  memcpy(lanes, &reg, sizeof(Register));
+  for (const int32_t lane : lanes) {
+    // width() only affects the next insertion, hence it is set for every lane.
+    std::cout.width(11);
+    std::cout << lane << ' ';
+  }
+}
+
+// Exploratory driver for Pack32: fills kPack registers with pseudo-random
+// values in [0, 100], prints the first four registers, runs
+// Pack32<3, Sum32Op> over the array and prints register 0 afterwards.
+// Nothing is asserted; the output is meant to be inspected by hand.
+INTGEMM_TARGET void Pack32Test() {
+  const std::size_t kPack = sizeof(Register) / sizeof(int32_t);
+  // The engine is left unseeded, so it starts from the default seed on every run.
+  std::mt19937 gen;
+  //std::uniform_int_distribution<int32_t> dist(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());
+  std::uniform_int_distribution<int32_t> dist(0, 100);
+  // Per-lane totals over all kPack registers.  Filled in below but not yet
+  // compared against the packed result.
+  std::vector<int32_t> reference(kPack, 0);
+  Register regs[kPack];
+  for (Register &reg : regs) {
+    int32_t lanes[kPack];
+    for (std::size_t lane = 0; lane < kPack; ++lane) {
+      const int32_t value = dist(gen);
+      lanes[lane] = value;
+      reference[lane] += value;
+    }
+    memcpy(&reg, lanes, sizeof(Register));
+  }
+  Register *indirect = regs;
+  // Show the inputs: the first four registers, one per line.
+  for (const Register *shown = indirect; shown != indirect + 4; ++shown) {
+    DumpRegister(*shown);
+    std::cout << '\n';
+  }
+  Pack32<3, Sum32Op>(indirect);
+  // Show what the reduction left in the first register.
+  DumpRegister(*indirect);
+  std::cout << '\n';
+}
+
+// Runs the Pack32 driver for this architecture's copy of the tests.
+TEST_CASE("Reduce " INTGEMM_TEST_NAME, "[tile]") {
+  // Do nothing when kCPU ranks below this architecture (presumably kCPU is
+  // the instruction-set level detected at runtime -- confirm).
+  if (kCPU < CPUType::INTGEMM_ARCH) return;
+  Pack32Test();
+}
+
} // namespace INTGEMM_ARCH
} // namespace intgemm
diff --git a/tile/reduce.h b/tile/reduce.h
index ecfcecc..0ab4e0d 100644
--- a/tile/reduce.h
+++ b/tile/reduce.h
@@ -37,3 +37,15 @@ struct Sum32Op {
#define INTGEMM_THIS_IS_SSE2
#include "reduce.inl"
#undef INTGEMM_THIS_IS_SSE2
+
+// Re-export Pack32 under further architecture namespaces: SSSE3 uses the SSE2
+// version and AVX512VNNI uses the AVX512BW version.
+// NOTE(review): the AVX512VNNI alias is unconditional; confirm that
+// AVX512BW::Pack32 is declared on every compiler this header supports.
+namespace intgemm {
+namespace SSSE3 { using SSE2::Pack32; }  // namespace SSSE3
+namespace AVX512VNNI { using AVX512BW::Pack32; }  // namespace AVX512VNNI
+}  // namespace intgemm
diff --git a/tile/reduce.inl b/tile/reduce.inl
index f902f04..3bfa899 100644
--- a/tile/reduce.inl
+++ b/tile/reduce.inl
@@ -14,6 +14,24 @@
namespace intgemm {
namespace INTGEMM_ARCH {
+// StaticLoop body used by Pack64.  For loop index i it reads registers 2i and
+// 2i + 1, interleaves their 64-bit halves with unpackhi_epi64 / unpacklo_epi64
+// (presumably thin wrappers over the intrinsics of the same name) and stores
+// Op::Run(hi, lo) into register i.
+template <class Op> struct Pack64Even {
+  template <class Iterator> INTGEMM_TARGET static inline void body(Register *regs) {
+    const Index pair = Iterator::template I<0>();
+    // Take copies first: for pair == 0 the destination is one of the inputs.
+    const Register even = regs[2 * pair];
+    const Register odd = regs[2 * pair + 1];
+    regs[pair] = Op::Run(unpackhi_epi64(even, odd), unpacklo_epi64(even, odd));
+  }
+};
+// Reduction stage that combines the two 64-bit halves of each register.
+// Registers [0, Valid) are consumed in pairs by Pack64Even, which leaves the
+// reduction of register 2i in the low 64 bits of register i and that of
+// register 2i + 1 in the high 64 bits.  A leftover odd register is folded onto
+// itself.  On return, registers [0, (Valid + 1) / 2) hold the results.
+//
+// Valid: number of input registers, starting at regs[0].
+// Op:    combining operation; Op::Run(a, b) is applied to whole registers.
+template <Index Valid, class Op> INTGEMM_TARGET static inline void Pack64(Register *regs) {
+  StaticLoop<Pack64Even<Op>, MakeStaticLoopIterator<Valid / 2>>(regs);
+  if (Valid & 1) {
+    // For the odd case, shuffle to form 0 g where g is garbage and 0 is accumulated.
+    // The 64-bit halves are swapped (control 0x4E) so that Op::Run leaves
+    // lo64 op hi64 in the low half, the same slot Pack64Even uses for an
+    // even-indexed register.  The previous control, 0xB0, combined lane 0 with
+    // lane 1 instead, which Pack64Even never does.
+    // Assumes shuffle_epi32 wraps the _mm*_shuffle_epi32 intrinsics -- confirm.
+    Register shuffled = shuffle_epi32(regs[Valid - 1], 0x4E /* BADC */);
+    regs[Valid / 2] = Op::Run(shuffled, regs[Valid - 1]);
+  }
+  // Now [0, (Valid + 1) / 2) contains registers to pack with 128-bit interleaving.
+}
+
template <class Op> struct Pack32Even {
template <class Iterator> INTGEMM_TARGET static inline void body(Register *regs) {
const Index i = Iterator::template I<0>();
@@ -31,6 +49,7 @@ template <Index Valid, class Op> INTGEMM_TARGET static inline void Pack32(Regist
regs[Valid / 2] = Op::Run(shuffled, regs[Valid - 1]);
}
// Now [0, (Valid + 1) / 2) contains registers to pack with 64-bit interleaving.
+ Pack64<(Valid + 1) / 2, Op>(regs);
}
} // namespace INTGEMM_ARCH