Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2020-04-25 12:41:49 +0300
committerKenneth Heafield <github@kheafield.com>2020-04-25 12:41:49 +0300
commit7b5c9e642fa0f84e10167ab1f4a7850ae5496eb9 (patch)
tree8dab4c8d186cf8b7a4d3bbc6ded1ee2d37b04ca3
parent6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff)
Memoising benchmark program to decide tile size, but it likes 1x16
-rw-r--r--benchmarks/benchmark_tile.cc246
-rw-r--r--types.h3
2 files changed, 234 insertions, 15 deletions
diff --git a/benchmarks/benchmark_tile.cc b/benchmarks/benchmark_tile.cc
index b7a5981..a528837 100644
--- a/benchmarks/benchmark_tile.cc
+++ b/benchmarks/benchmark_tile.cc
@@ -5,41 +5,257 @@
#include "../tile/dot.h"
#include <chrono>
-#include <iomanip>
+#include <functional>
+#include <limits>
#include <random>
#include <vector>
+#include <stdio.h>
+
+#include <unordered_map>
namespace intgemm {
namespace {
typedef TestMatrices8::AccessT Accessor;
-template <Index A_rows, Index B_cols> static inline void BenchmarkOne(Accessor access, Tile shape) {
- const std::size_t kTries = 4;
+// What do we do with the overhang? Include in the bottom, include in right, or separate call?
+enum Overhang {
+ BOTTOM,
+ RIGHT,
+ SEPARATE,
+ NONE
+};
+
+/* Entry in the memoisation table. It has three levels of validity:
+ * Blank:
+ * direct_time == std::numeric_limits<double>::infinity()
+ * recursive_time == std::numeric_limits<double>::infinity()
+ * Direct evaluated, but not splits:
+ * direct_time valid
+ * recursive_time == std::numeric_limits<double>::infinity()
+ * This is useful e.g. if the size is being considered for big evaluation.
+ * Full:
+ * direct_time valid
+ * recursive_time valid
+ It's been evaluated with splitting into big, right, bottom, and bottom right corner etc
+ */
+struct BestConfig {
+ // Big matrix size.
+ Index big_A_rows;
+ Index big_B_cols;
+ // The kernel used for the big matrix size.
+ Index kernel_A_rows;
+ Index kernel_B_cols;
+ Overhang overhang;
+ double recursive_time { std::numeric_limits<double>::infinity() };
+ double direct_time { std::numeric_limits<double>::infinity() };
+
+ void AbsorbIndirect(BestConfig other) {
+ if (other.recursive_time < recursive_time) {
+ double direct_save = direct_time;
+ *this = other;
+ direct_time = direct_save;
+ }
+ }
+
+ void Zero() {
+ big_A_rows = 0;
+ big_B_cols = 0;
+ kernel_A_rows = 0;
+ kernel_B_cols = 0;
+ overhang = NONE;
+ recursive_time = 0.0;
+ direct_time = 0.0;
+ }
+
+ char OverhangDescription() const {
+ switch (overhang) {
+ case BOTTOM:
+ return 'B';
+ case RIGHT:
+ return 'R';
+ case SEPARATE:
+ return 'S';
+ case NONE:
+ return 'N';
+ default:
+ std::cerr << "Bad overhang?" << std::endl;
+ return 'F';
+ }
+ }
+};
+
+template <Index A_rows, Index B_cols> static inline double BenchmarkNoOverhang(Accessor access, Tile shape) {
+ if ((shape.A_rows % A_rows) || (shape.B_cols % B_cols))
+ return std::numeric_limits<double>::infinity();
+ const std::size_t kTries = 20;
auto start = std::chrono::steady_clock::now();
+ typedef AVX512VNNI::UnrollKernel<A_rows, 1, B_cols, AVX512VNNI::Shifted8> Kernel;
// Burn in.
- AVX512VNNI::Multiply<Accessor, AVX512VNNI::Shifted8, A_rows, B_cols>(access, shape);
+ // TODO: different arches, guard against old compilers, etc.
+ AVX512VNNI::MultiplyNoOverhang<Accessor, Kernel>(access, shape);
for (std::size_t t = 0; t < kTries; ++t) {
- // TODO: try various multipliers, guard against old compilers, etc.
- AVX512VNNI::Multiply<Accessor, AVX512VNNI::Shifted8, A_rows, B_cols>(access, shape);
+ AVX512VNNI::MultiplyNoOverhang<Accessor, Kernel>(access, shape);
}
auto end = std::chrono::steady_clock::now();
- double took = std::chrono::duration<double>(end - start).count() / kTries;
- std::cout << std::setw(8) << std::setprecision(4) << took << ' ' << std::setw(2) << A_rows << 'x' << std::setw(2) << B_cols << std::endl;
+ return std::chrono::duration<double>(end - start).count() / kTries;
}
-template <std::size_t... Iterator> static inline void BenchmarkKernels(Tile shape, index_sequence<Iterator...>) {
- constexpr Index ColsMax = 16;
- TestMatrices8 matrices(shape);
+template <Index A_rows, Index B_cols> static void BenchmarkAndUpdate(Accessor access, Tile shape, BestConfig &out) {
+ double time = BenchmarkNoOverhang<A_rows, B_cols>(access, shape);
+ if (time < out.direct_time) {
+ out.big_A_rows = shape.A_rows;
+ out.big_B_cols = shape.B_cols;
+ out.kernel_A_rows = A_rows;
+ out.kernel_B_cols = B_cols;
+ out.direct_time = time;
+ out.overhang = NONE;
+ }
+}
+
+// Size of inner loop to sweep.
+constexpr Index kColsMax = 32;
+constexpr Index kRowsMax = 32;
+
+template <std::size_t... Iterator> static inline BestConfig BenchmarkKernels(TestMatrices8::AccessT accessor, Tile shape, index_sequence<Iterator...>) {
+ BestConfig ret;
using unfurl = int[];
- (void)unfurl{0, (
- BenchmarkOne<(Iterator / ColsMax) + 1, (Iterator % ColsMax) + 1>(matrices.Accessor(), shape)
- , 0)...};
+ // Could have used return values and built an array but the indices were annoying to handle.
+ (void)unfurl {0, (
+ BenchmarkAndUpdate<(Iterator / kColsMax) + 1, (Iterator % kColsMax) + 1>(accessor, shape, ret)
+ , 0)...
+ };
+ ret.recursive_time = ret.direct_time;
+ return ret;
}
+struct TileHash {
+ std::size_t operator()(const Tile &t) const noexcept {
+ std::hash<Index> hasher;
+ std::size_t seed = hasher(t.A_rows);
+ seed ^= hasher(t.inner) + 0x9e3779b9 + (seed<<6) + (seed>>2);
+ seed ^= hasher(t.B_cols) + 0x9e3779b9 + (seed<<6) + (seed>>2);
+ return seed;
+ }
+};
+
+class Memoise {
+ public:
+ explicit Memoise(Tile max_problem) :
+ matrices_(max_problem) {
+ // We'll just assume zero-size matrices take zero time.
+ zero_.Zero();
+ }
+
+ BestConfig Find(const Tile problem) {
+ BestConfig &best = Entry(problem);
+ const Index A_rows = problem.A_rows;
+ const Index inner = problem.inner;
+ const Index B_cols = problem.B_cols;
+ if (best.recursive_time != std::numeric_limits<double>::infinity()) return best;
+ // No overhang.
+ best = BenchmarkKernels(matrices_.Accessor(), problem, make_index_sequence<kColsMax * kRowsMax>());
+ for (Index row_overhang = 0; row_overhang < std::min(kRowsMax, A_rows); ++row_overhang) {
+ // Don't visit (0,0).
+ for (Index col_overhang = (row_overhang == 0 ? 1 : 0); col_overhang < std::min(kColsMax, B_cols); ++col_overhang) {
+ // This option does full recursion on breaking up big.
+ //BestConfig big = Find(A_rows - row_overhang, B_cols - col_overhang);
+ // This option assumes we run big directly.
+ BestConfig &big = Entry({A_rows - row_overhang, problem.inner, B_cols - col_overhang});
+ if (big.direct_time == std::numeric_limits<double>::infinity()) {
+ big = BenchmarkKernels(matrices_.Accessor(), Tile{A_rows - row_overhang, problem.inner, B_cols - col_overhang}, make_index_sequence<kColsMax * kRowsMax>());
+ }
+
+ BestConfig bottom_long = Find({row_overhang, inner, B_cols});
+ BestConfig bottom_short = Find({row_overhang, inner, B_cols - col_overhang});
+ BestConfig right_long = Find({A_rows, inner, col_overhang});
+ BestConfig right_short = Find({A_rows - row_overhang, inner, col_overhang});
+ BestConfig bottom_right = Find({row_overhang, inner, col_overhang});
+ BestConfig candidate;
+ candidate.big_A_rows = A_rows - row_overhang;
+ candidate.big_B_cols = B_cols - col_overhang;
+ // Record kernel from big tile
+ candidate.kernel_A_rows = big.kernel_A_rows;
+ candidate.kernel_B_cols = big.kernel_B_cols;
+
+ candidate.recursive_time = big.direct_time + bottom_long.recursive_time + right_short.recursive_time;
+ candidate.overhang = BOTTOM;
+ best.AbsorbIndirect(candidate);
+
+ candidate.recursive_time = big.direct_time + bottom_short.recursive_time + right_long.recursive_time;
+ candidate.overhang = RIGHT;
+ best.AbsorbIndirect(candidate);
+
+ candidate.recursive_time = big.direct_time + bottom_short.recursive_time + right_short.recursive_time + bottom_right.recursive_time;
+ candidate.overhang = SEPARATE;
+ best.AbsorbIndirect(candidate);
+ }
+ }
+ fprintf(stderr, "%8.3fus %8.3fus problem=%4zux%4zux%4zu big=%4zux%4zu bigkernel=%2zux%2zu overhang=%c\n", best.recursive_time * 1000000.0, best.direct_time * 1000000.0, problem.A_rows, problem.inner, problem.B_cols, best.big_A_rows, best.big_B_cols, best.kernel_A_rows, best.kernel_B_cols, best.OverhangDescription());
+ return best;
+ }
+
+ void Print(Tile problem, unsigned int nest) {
+ BestConfig best = Find(problem);
+ printf("%8.3fus %8.3fus problem=%4zux%4zux%4zu big=%4zux%4zu bigkernel=%2zux%2zu overhang=%c", best.recursive_time * 1000000.0, best.direct_time * 1000000.0, problem.A_rows, problem.inner, problem.B_cols, best.big_A_rows, best.big_B_cols, best.kernel_A_rows, best.kernel_B_cols, best.OverhangDescription());
+ for (unsigned int i = 0; i < nest; ++i) {
+ printf("*");
+ }
+ printf("\n");
+
+ if (best.overhang != NONE) {
+ Print({best.big_A_rows, problem.inner, best.big_B_cols}, nest + 1);
+ if (problem.A_rows != best.big_A_rows) {
+ Print({problem.A_rows - best.big_A_rows, problem.inner, best.overhang == BOTTOM ? problem.B_cols : best.big_B_cols}, nest + 1);
+ }
+ if (problem.B_cols != best.big_B_cols) {
+ Print({best.overhang == RIGHT ? problem.A_rows : best.big_A_rows, problem.inner, problem.B_cols - best.big_B_cols}, nest + 1);
+ }
+ if (best.overhang == SEPARATE) {
+ Print({problem.A_rows - best.big_A_rows, problem.inner, problem.B_cols - best.big_B_cols}, nest + 1);
+ }
+ }
+ }
+
+ private:
+ BestConfig &Entry(Tile size) {
+ // Nobody should modify zero because it is already valid.
+ if (size.A_rows == 0 || size.B_cols == 0) { return zero_; }
+ return table_[size];
+ }
+
+ std::unordered_map<Tile, BestConfig, TileHash> table_;
+
+ BestConfig zero_;
+
+ TestMatrices8 matrices_;
+};
+
} // namespace
} // namespace intgemm
int main() {
- intgemm::BenchmarkKernels({1024, 1024, 1024}, intgemm::make_index_sequence<16*16>());
+ intgemm::Tile shapes[] = {
+ {8, 256, 256},
+ {8, 2048, 256},
+ {8, 256, 2048},
+ {320, 256, 256},
+ {472, 256, 256},
+ {248, 256, 256},
+ {200, 256, 256},
+ // Additional stuff
+ {512, 512, 512},
+ {1024, 1024, 1024},
+ {64, 1024, 1024},
+ };
+ intgemm::Tile largest = {0,0,0};
+ for (const intgemm::Tile *i = shapes; i < shapes + sizeof(shapes) / sizeof(intgemm::Tile); ++i) {
+ largest.A_rows = std::max(largest.A_rows, i->A_rows);
+ largest.inner = std::max(largest.inner, i->inner);
+ largest.B_cols = std::max(largest.B_cols, i->B_cols);
+ }
+ intgemm::Memoise memo(largest);
+ for (const intgemm::Tile *i = shapes; i < shapes + sizeof(shapes) / sizeof(intgemm::Tile); ++i) {
+ memo.Print(*i, 0);
+ }
}
diff --git a/types.h b/types.h
index 9f40fcb..cfad855 100644
--- a/types.h
+++ b/types.h
@@ -52,6 +52,9 @@ extern const CPUType kCPU;
struct Tile {
Index A_rows, inner, B_cols;
constexpr bool empty() const { return !A_rows || !inner || !B_cols; }
+ constexpr bool operator==(const Tile other) const {
+ return A_rows == other.A_rows && inner == other.inner && B_cols == other.B_cols;
+ }
};
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI