| author | Young Jin Kim <youki@microsoft.com> | 2019-06-19 02:59:11 +0300 |
|---|---|---|
| committer | Young Jin Kim <youki@microsoft.com> | 2019-06-19 02:59:11 +0300 |
| commit | d2d8ec041d443f906809b912a57a3d41b2c3a07c (patch) | |
| tree | 6ec098259c9731bb42bf93ad8b8520daece8cae6 /src/graph | |
| parent | 37808eaae3eaaf344857dae51e6e961dac569d04 (diff) | |
Enable FBGEMM based packed GEMM on windows
Diffstat (limited to 'src/graph')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/graph/auto_tuner.h | 2 |
| -rwxr-xr-x | src/graph/expression_operators.cpp | 96 |

2 files changed, 73 insertions, 25 deletions
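For context on what this commit adds: a packed GEMM pre-transforms the weight matrix B into a blocked, cache-friendly layout once, so that repeated multiplications against the same B (the typical inference case, and the reason for the `b->memoize()` guard in the diff below) skip the reordering cost on every call. The sketch below illustrates that idea only; `PackedB`, `packB`, and `gemmWithPackedB` are hypothetical names, not the FBGEMM or Marian API, and real FBGEMM packing additionally quantizes and uses architecture-specific layouts.

```cpp
#include <cstddef>
#include <vector>

struct PackedB {
  static constexpr std::size_t NR = 8;  // panel width (columns per panel)
  std::vector<float> data;              // panels stored back to back
  std::size_t k = 0, n = 0;
};

// Reorder row-major B (k x n) into column panels of width NR, zero-padding
// the last panel. Done once per weight matrix.
PackedB packB(const float* B, std::size_t k, std::size_t n) {
  const std::size_t NR = PackedB::NR;
  const std::size_t panels = (n + NR - 1) / NR;
  PackedB p;
  p.k = k;
  p.n = n;
  p.data.assign(panels * k * NR, 0.0f);
  std::size_t idx = 0;
  for (std::size_t j0 = 0; j0 < panels * NR; j0 += NR)  // each column panel
    for (std::size_t l = 0; l < k; ++l)                 // each row of B
      for (std::size_t j = j0; j < j0 + NR; ++j)        // columns in panel
        p.data[idx++] = (j < n) ? B[l * n + j] : 0.0f;
  return p;
}

// C (m x n, row-major) = A (m x k, row-major) * B, with B already packed.
// The inner loops stream sequentially through one k x NR panel at a time.
void gemmWithPackedB(const float* A, const PackedB& p, float* C, std::size_t m) {
  const std::size_t NR = PackedB::NR;
  for (std::size_t i = 0; i < m * p.n; ++i) C[i] = 0.0f;
  for (std::size_t j0 = 0; j0 < p.n; j0 += NR) {
    const float* panel = p.data.data() + j0 * p.k;  // this panel's k x NR block
    for (std::size_t i = 0; i < m; ++i)
      for (std::size_t l = 0; l < p.k; ++l) {
        const float a = A[i * p.k + l];
        for (std::size_t j = j0; j < j0 + NR && j < p.n; ++j)
          C[i * p.n + j] += a * panel[l * NR + (j - j0)];
      }
  }
}
```

Packing only pays off when B is reused; for a one-off product the reordering costs roughly what it saves, which is why the diff falls back to the `AffineNodeOp` path when `b->memoize()` is false.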
diff --git a/src/graph/auto_tuner.h b/src/graph/auto_tuner.h
index 7bf80c79..868a800e 100644
--- a/src/graph/auto_tuner.h
+++ b/src/graph/auto_tuner.h
@@ -20,7 +20,7 @@ class AutoTuner : public AutoTunerRecorder {
 private:
   typedef std::function<Return(Args...)> Algorithm;
 
-  const size_t max = 100;
+  const size_t max = 50;
 
   UPtr<timer::CPUTimer> timer_;
 
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 396fa03e..42bb65e2 100755
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -7,6 +7,7 @@
 
 #include "graph/auto_tuner.h"
 #include "tensors/cpu/int16.h"
+#include "tensors/cpu/expanded_gemm.h"
 
 namespace marian {
 
@@ -410,7 +411,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
   float clipValue = a->graph()->getBackend()->getClip();
 
   if(a->graph()->isOptimized() && device == DeviceType::cpu) {
-    bool autotune = true;
+    bool autotune = false;
     if(autotune) {
       thread_local Ptr<AutoTuner<Expr>> tuner = New<AutoTuner<Expr>>();
 
@@ -431,60 +432,107 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
       util::hash_combine(hash, transA);
       util::hash_combine(hash, transB);
 
+#if USE_FBGEMM
+      // Use Packed GEMM only if it's memoized
+      if(b->memoize()) {
+        // add third algorithm variant (Packed GEMM)
+        size_t hash1 = hash;
+        util::hash_combine(hash1, 1);
+        auto rec1 = [=](Expr e, bool stop = false) {
+          e->record(tuner, hash1, stop);
+          return e;
+        };
+
+        auto alg1 = [=]() {
+          auto packed = cpu::pack::pack(b, cpu::pack::PackMatrix::B, transB, clipValue);
+
+          return rec1(
+              cpu::pack::affine(
+                  clip(a, clipValue), packed,
+                  b->shape(),
+                  bias,
+                  transA,
+                  transB,
+                  scale),
+              true);
+        };
+        tuner->insert({hash1, alg1});
+      }
+#endif  // USE_FBGEMM
+
       // add first algorithm variant (Int16)
-      size_t hash1 = hash;
-      util::hash_combine(hash1, 1);
-      auto rec1 = [=](Expr e, bool stop = false) {
-        e->record(tuner, hash1, stop);
+      size_t hash2 = hash;
+      util::hash_combine(hash2, 2);
+      auto rec2 = [=](Expr e, bool stop = false) {
+        e->record(tuner, hash2, stop);
         return e;
       };
 
-      auto alg1 = [=]() {
-        return rec1(
+      auto alg2 = [=]() {
+        return rec2(
             cpu::int16::affine(
-                rec1(cpu::int16::quantize(transA ? rec1(transpose(a)) : a,
+                rec2(cpu::int16::quantize(transA ? rec2(transpose(a)) : a,
                                           clipValue)),
                 cpu::int16::quantize(transB ? b : transpose(b), clipValue),
                 bias,
                 scale),
             true);
       };
-      tuner->insert({hash1, alg1});
+      tuner->insert({hash2, alg2});
 
       // add second algorithm variant (CBlas)
-      size_t hash2 = hash;
-      util::hash_combine(hash2, 2);
-      auto rec2 = [=](Expr e, bool stop = false) {
-        e->record(tuner, hash2, stop);
+      size_t hash3 = hash;
+      util::hash_combine(hash3, 3);
+      auto rec3 = [=](Expr e, bool stop = false) {
+        e->record(tuner, hash3, stop);
         return e;
       };
 
-      auto alg2 = [=]() {
+      auto alg3 = [=]() {
         auto ac = clip(a, clipValue);
         if(ac != a)
-          ac = rec2(ac);
+          ac = rec3(ac);
 
         auto bc = clip(b, clipValue);
         if(bc != b)
-          bc = rec2(bc);
+          bc = rec3(bc);
 
         int rows = ac->shape().elements() / ac->shape()[-1];
         Expr ones = ac->graph()->ones({rows, 1});
         std::vector<Expr> nodes = {ac, bc, bias, ones};
-        return rec2(Expression<AffineNodeOp>(nodes, transA, transB, scale),
+        return rec3(Expression<AffineNodeOp>(nodes, transA, transB, scale),
                     true);
       };
-      tuner->insert({hash2, alg2});
+      tuner->insert({hash3, alg3});
 
       // execute algorithm with autotuning
       return tuner->run();
 
     } else {
-      // cpu int16 version
-      return cpu::int16::affine(
-          cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
-          cpu::int16::quantize(transB ? b : transpose(b), clipValue),
-          bias,
-          scale);
+      if(b->memoize()) {
+        auto packed = cpu::pack::pack(b, cpu::pack::PackMatrix::B, transB, clipValue);
+        // auto packed = transB ?
+        //     cpu::pack::pack(transpose(b), cpu::pack::PackMatrix::B, false, clipValue) :
+        //     cpu::pack::pack(b, cpu::pack::PackMatrix::B, false, clipValue);
+
+        return cpu::pack::affine(
+            clip(a, clipValue), packed,
+            b->shape(),
+            bias,
+            transA,
+            transB,
+            scale);
+        // cpu int16 version
+        // return cpu::int16::affine(
+        //     cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
+        //     cpu::int16::quantize(transB ? b : transpose(b), clipValue),
+        //     bias,
+        //     scale);
+      } else {
+        int rows = a->shape().elements() / a->shape()[-1];
+        Expr ones = a->graph()->ones({rows, 1});
+        std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
+        return Expression<AffineNodeOp>(nodes, transA, transB, scale);
+      }
     }
   } else {
     // general version, MKL, CBlas or CUDA
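The autotuner changes above follow a simple pattern: every candidate kernel is registered under a key built from the operand shapes plus a per-variant tag (1 = packed GEMM, 2 = int16, 3 = CBlas), and the tuner measures each variant for a bounded number of calls (the `max` budget, lowered from 100 to 50 in auto_tuner.h) before committing to the fastest. Below is a minimal sketch of that pattern, assuming synchronous timing; Marian's real AutoTuner instead records timings through the `rec*`/`record` callbacks after graph execution, so this is an illustration, not the actual class.

```cpp
#include <chrono>
#include <cstddef>
#include <functional>
#include <limits>
#include <unordered_map>
#include <vector>

// Boost-style hash mixer, standing in for util::hash_combine from the diff.
inline void hashCombine(std::size_t& seed, std::size_t v) {
  seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

template <class Return>
class MiniAutoTuner {
  struct Entry {
    std::function<Return()> alg;
    double totalTime = 0.0;  // seconds spent in measured runs
    std::size_t runs = 0;    // measured runs so far
  };
  static constexpr std::size_t kMaxRuns = 50;  // measurement budget, cf. `max` in auto_tuner.h
  std::unordered_map<std::size_t, Entry> entries_;
  std::vector<std::size_t> order_;  // registration order of keys

public:
  void insert(std::size_t key, std::function<Return()> alg) {
    if (entries_.emplace(key, Entry{std::move(alg)}).second)
      order_.push_back(key);
  }

  // Measure each variant up to kMaxRuns times, then always run the one with
  // the lowest average time. Assumes at least one variant was inserted.
  Return run() {
    std::size_t chosen = order_.front();
    bool measuring = false;
    for (std::size_t key : order_) {
      if (entries_[key].runs < kMaxRuns) { chosen = key; measuring = true; break; }
    }
    if (!measuring) {
      double best = std::numeric_limits<double>::max();
      for (std::size_t key : order_) {
        Entry& e = entries_[key];
        double avg = e.totalTime / static_cast<double>(e.runs);
        if (avg < best) { best = avg; chosen = key; }
      }
    }
    Entry& e = entries_[chosen];
    auto t0 = std::chrono::steady_clock::now();
    Return result = e.alg();
    e.totalTime += std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
    ++e.runs;
    return result;
  }
};
```

Usage mirrors the diff: build a base key with `hashCombine` over the shapes and transpose flags, derive `hash1`/`hash2`/`hash3` by combining in a variant tag, `insert` one lambda per variant, and call `run()` at every affine node. Keying on shape means each distinct GEMM size is tuned independently, which is why the packed variant can win for large weight matrices while another variant wins elsewhere.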