diff options
author | Kenneth Heafield <github@kheafield.com> | 2020-04-25 12:41:49 +0300 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2020-04-25 12:41:49 +0300 |
commit | 7b5c9e642fa0f84e10167ab1f4a7850ae5496eb9 (patch) | |
tree | 8dab4c8d186cf8b7a4d3bbc6ded1ee2d37b04ca3 | |
parent | 6377ee4d9f051d7be0c9c290bb33ab66f27ea900 (diff) |
Memoising benchmark program to decide tile size, but it likes 1x16
-rw-r--r-- | benchmarks/benchmark_tile.cc | 246 | ||||
-rw-r--r-- | types.h | 3 |
2 files changed, 234 insertions, 15 deletions
diff --git a/benchmarks/benchmark_tile.cc b/benchmarks/benchmark_tile.cc index b7a5981..a528837 100644 --- a/benchmarks/benchmark_tile.cc +++ b/benchmarks/benchmark_tile.cc @@ -5,41 +5,257 @@ #include "../tile/dot.h" #include <chrono> -#include <iomanip> +#include <functional> +#include <limits> #include <random> #include <vector> +#include <stdio.h> + +#include <unordered_map> namespace intgemm { namespace { typedef TestMatrices8::AccessT Accessor; -template <Index A_rows, Index B_cols> static inline void BenchmarkOne(Accessor access, Tile shape) { - const std::size_t kTries = 4; +// What do we do with the overhang? Include in the bottom, include in right, or separate call? +enum Overhang { + BOTTOM, + RIGHT, + SEPARATE, + NONE +}; + +/* Entry in the memoisation table. It has three levels of validity: + * Blank: + * direct_time == std::numeric_limits<double>::infinity() + * recursive_time == std::numeric_limits<double>::infinity() + * Direct evaluated, but not splits: + * direct_time valid + * recursive_time == std::numeric_limits<double>::infinity() + * This is useful e.g. if the size is being considered for big evaluation. + * Full: + * direct_time valid + * recursive_time valid + It's been evaluated with splitting into big, right, bottom, and bottom right corner etc + */ +struct BestConfig { + // Big matrix size. + Index big_A_rows; + Index big_B_cols; + // The kernel used for the big matrix size. + Index kernel_A_rows; + Index kernel_B_cols; + Overhang overhang; + double recursive_time { std::numeric_limits<double>::infinity() }; + double direct_time { std::numeric_limits<double>::infinity() }; + + void AbsorbIndirect(BestConfig other) { + if (other.recursive_time < recursive_time) { + double direct_save = direct_time; + *this = other; + direct_time = direct_save; + } + } + + void Zero() { + big_A_rows = 0; + big_B_cols = 0; + kernel_A_rows = 0; + kernel_B_cols = 0; + overhang = NONE; + recursive_time = 0.0; + direct_time = 0.0; + } + + char OverhangDescription() const { + switch (overhang) { + case BOTTOM: + return 'B'; + case RIGHT: + return 'R'; + case SEPARATE: + return 'S'; + case NONE: + return 'N'; + default: + std::cerr << "Bad overhang?" << std::endl; + return 'F'; + } + } +}; + +template <Index A_rows, Index B_cols> static inline double BenchmarkNoOverhang(Accessor access, Tile shape) { + if ((shape.A_rows % A_rows) || (shape.B_cols % B_cols)) + return std::numeric_limits<double>::infinity(); + const std::size_t kTries = 20; auto start = std::chrono::steady_clock::now(); + typedef AVX512VNNI::UnrollKernel<A_rows, 1, B_cols, AVX512VNNI::Shifted8> Kernel; // Burn in. - AVX512VNNI::Multiply<Accessor, AVX512VNNI::Shifted8, A_rows, B_cols>(access, shape); + // TODO: different arches, guard against old compilers, etc. + AVX512VNNI::MultiplyNoOverhang<Accessor, Kernel>(access, shape); for (std::size_t t = 0; t < kTries; ++t) { - // TODO: try various multipliers, guard against old compilers, etc. - AVX512VNNI::Multiply<Accessor, AVX512VNNI::Shifted8, A_rows, B_cols>(access, shape); + AVX512VNNI::MultiplyNoOverhang<Accessor, Kernel>(access, shape); } auto end = std::chrono::steady_clock::now(); - double took = std::chrono::duration<double>(end - start).count() / kTries; - std::cout << std::setw(8) << std::setprecision(4) << took << ' ' << std::setw(2) << A_rows << 'x' << std::setw(2) << B_cols << std::endl; + return std::chrono::duration<double>(end - start).count() / kTries; } -template <std::size_t... Iterator> static inline void BenchmarkKernels(Tile shape, index_sequence<Iterator...>) { - constexpr Index ColsMax = 16; - TestMatrices8 matrices(shape); +template <Index A_rows, Index B_cols> static void BenchmarkAndUpdate(Accessor access, Tile shape, BestConfig &out) { + double time = BenchmarkNoOverhang<A_rows, B_cols>(access, shape); + if (time < out.direct_time) { + out.big_A_rows = shape.A_rows; + out.big_B_cols = shape.B_cols; + out.kernel_A_rows = A_rows; + out.kernel_B_cols = B_cols; + out.direct_time = time; + out.overhang = NONE; + } +} + +// Size of inner loop to sweep. +constexpr Index kColsMax = 32; +constexpr Index kRowsMax = 32; + +template <std::size_t... Iterator> static inline BestConfig BenchmarkKernels(TestMatrices8::AccessT accessor, Tile shape, index_sequence<Iterator...>) { + BestConfig ret; using unfurl = int[]; - (void)unfurl{0, ( - BenchmarkOne<(Iterator / ColsMax) + 1, (Iterator % ColsMax) + 1>(matrices.Accessor(), shape) - , 0)...}; + // Could have used return values and built an array but the indices were annoying to handle. + (void)unfurl {0, ( + BenchmarkAndUpdate<(Iterator / kColsMax) + 1, (Iterator % kColsMax) + 1>(accessor, shape, ret) + , 0)... + }; + ret.recursive_time = ret.direct_time; + return ret; } +struct TileHash { + std::size_t operator()(const Tile &t) const noexcept { + std::hash<Index> hasher; + std::size_t seed = hasher(t.A_rows); + seed ^= hasher(t.inner) + 0x9e3779b9 + (seed<<6) + (seed>>2); + seed ^= hasher(t.B_cols) + 0x9e3779b9 + (seed<<6) + (seed>>2); + return seed; + } +}; + +class Memoise { + public: + explicit Memoise(Tile max_problem) : + matrices_(max_problem) { + // We'll just assume zero-size matrices take zero time. + zero_.Zero(); + } + + BestConfig Find(const Tile problem) { + BestConfig &best = Entry(problem); + const Index A_rows = problem.A_rows; + const Index inner = problem.inner; + const Index B_cols = problem.B_cols; + if (best.recursive_time != std::numeric_limits<double>::infinity()) return best; + // No overhang. + best = BenchmarkKernels(matrices_.Accessor(), problem, make_index_sequence<kColsMax * kRowsMax>()); + for (Index row_overhang = 0; row_overhang < std::min(kRowsMax, A_rows); ++row_overhang) { + // Don't visit (0,0). + for (Index col_overhang = (row_overhang == 0 ? 1 : 0); col_overhang < std::min(kColsMax, B_cols); ++col_overhang) { + // This option does full recursion on breaking up big. + //BestConfig big = Find(A_rows - row_overhang, B_cols - col_overhang); + // This option assumes we run big directly. + BestConfig &big = Entry({A_rows - row_overhang, problem.inner, B_cols - col_overhang}); + if (big.direct_time == std::numeric_limits<double>::infinity()) { + big = BenchmarkKernels(matrices_.Accessor(), Tile{A_rows - row_overhang, problem.inner, B_cols - col_overhang}, make_index_sequence<kColsMax * kRowsMax>()); + } + + BestConfig bottom_long = Find({row_overhang, inner, B_cols}); + BestConfig bottom_short = Find({row_overhang, inner, B_cols - col_overhang}); + BestConfig right_long = Find({A_rows, inner, col_overhang}); + BestConfig right_short = Find({A_rows - row_overhang, inner, col_overhang}); + BestConfig bottom_right = Find({row_overhang, inner, col_overhang}); + BestConfig candidate; + candidate.big_A_rows = A_rows - row_overhang; + candidate.big_B_cols = B_cols - col_overhang; + // Record kernel from big tile + candidate.kernel_A_rows = big.kernel_A_rows; + candidate.kernel_B_cols = big.kernel_B_cols; + + candidate.recursive_time = big.direct_time + bottom_long.recursive_time + right_short.recursive_time; + candidate.overhang = BOTTOM; + best.AbsorbIndirect(candidate); + + candidate.recursive_time = big.direct_time + bottom_short.recursive_time + right_long.recursive_time; + candidate.overhang = RIGHT; + best.AbsorbIndirect(candidate); + + candidate.recursive_time = big.direct_time + bottom_short.recursive_time + right_short.recursive_time + bottom_right.recursive_time; + candidate.overhang = SEPARATE; + best.AbsorbIndirect(candidate); + } + } + fprintf(stderr, "%8.3fus %8.3fus problem=%4zux%4zux%4zu big=%4zux%4zu bigkernel=%2zux%2zu overhang=%c\n", best.recursive_time * 1000000.0, best.direct_time * 1000000.0, problem.A_rows, problem.inner, problem.B_cols, best.big_A_rows, best.big_B_cols, best.kernel_A_rows, best.kernel_B_cols, best.OverhangDescription()); + return best; + } + + void Print(Tile problem, unsigned int nest) { + BestConfig best = Find(problem); + printf("%8.3fus %8.3fus problem=%4zux%4zux%4zu big=%4zux%4zu bigkernel=%2zux%2zu overhang=%c", best.recursive_time * 1000000.0, best.direct_time * 1000000.0, problem.A_rows, problem.inner, problem.B_cols, best.big_A_rows, best.big_B_cols, best.kernel_A_rows, best.kernel_B_cols, best.OverhangDescription()); + for (unsigned int i = 0; i < nest; ++i) { + printf("*"); + } + printf("\n"); + + if (best.overhang != NONE) { + Print({best.big_A_rows, problem.inner, best.big_B_cols}, nest + 1); + if (problem.A_rows != best.big_A_rows) { + Print({problem.A_rows - best.big_A_rows, problem.inner, best.overhang == BOTTOM ? problem.B_cols : best.big_B_cols}, nest + 1); + } + if (problem.B_cols != best.big_B_cols) { + Print({best.overhang == RIGHT ? problem.A_rows : best.big_A_rows, problem.inner, problem.B_cols - best.big_B_cols}, nest + 1); + } + if (best.overhang == SEPARATE) { + Print({problem.A_rows - best.big_A_rows, problem.inner, problem.B_cols - best.big_B_cols}, nest + 1); + } + } + } + + private: + BestConfig &Entry(Tile size) { + // Nobody should modify zero because it is already valid. + if (size.A_rows == 0 || size.B_cols == 0) { return zero_; } + return table_[size]; + } + + std::unordered_map<Tile, BestConfig, TileHash> table_; + + BestConfig zero_; + + TestMatrices8 matrices_; +}; + } // namespace } // namespace intgemm int main() { - intgemm::BenchmarkKernels({1024, 1024, 1024}, intgemm::make_index_sequence<16*16>()); + intgemm::Tile shapes[] = { + {8, 256, 256}, + {8, 2048, 256}, + {8, 256, 2048}, + {320, 256, 256}, + {472, 256, 256}, + {248, 256, 256}, + {200, 256, 256}, + // Additional stuff + {512, 512, 512}, + {1024, 1024, 1024}, + {64, 1024, 1024}, + }; + intgemm::Tile largest = {0,0,0}; + for (const intgemm::Tile *i = shapes; i < shapes + sizeof(shapes) / sizeof(intgemm::Tile); ++i) { + largest.A_rows = std::max(largest.A_rows, i->A_rows); + largest.inner = std::max(largest.inner, i->inner); + largest.B_cols = std::max(largest.B_cols, i->B_cols); + } + intgemm::Memoise memo(largest); + for (const intgemm::Tile *i = shapes; i < shapes + sizeof(shapes) / sizeof(intgemm::Tile); ++i) { + memo.Print(*i, 0); + } } @@ -52,6 +52,9 @@ extern const CPUType kCPU; struct Tile { Index A_rows, inner, B_cols; constexpr bool empty() const { return !A_rows || !inner || !B_cols; } + constexpr bool operator==(const Tile other) const { + return A_rows == other.A_rows && inner == other.inner && B_cols == other.B_cols; + } }; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI |