diff options
-rw-r--r-- | cmake/FindMKL.cmake | 10 | ||||
-rwxr-xr-x | src/common/definitions.h | 1 | ||||
-rw-r--r-- | src/graph/auto_tuner.h | 17 | ||||
-rwxr-xr-x | src/graph/expression_operators.cpp | 128 | ||||
-rw-r--r-- | src/microsoft/quicksand.cpp | 2 | ||||
-rwxr-xr-x | src/models/transformer_stub.cpp | 2 | ||||
-rw-r--r-- | src/tensors/cpu/backend.h | 4 | ||||
-rw-r--r-- | src/tensors/cpu/expanded_gemm.h | 2 |
8 files changed, 95 insertions, 71 deletions
diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index 028161e3..a484e9ab 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -53,11 +53,11 @@ else() set(COR_LIB "mkl_core") endif() -if(MSVC)
- set(ProgramFilesx86 "ProgramFiles(x86)")
- set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows)
-else()
- set(INTEL_ROOT_DEFAULT "/opt/intel")
+if(MSVC) + set(ProgramFilesx86 "ProgramFiles(x86)") + set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows) +else() + set(INTEL_ROOT_DEFAULT "/opt/intel") endif() set(INTEL_ROOT ${INTEL_ROOT_DEFAULT} CACHE PATH "Folder contains intel libs") find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKLROOT} ${INTEL_ROOT}/mkl diff --git a/src/common/definitions.h b/src/common/definitions.h index ea14a8e5..f46da234 100755 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -116,5 +116,4 @@ typedef Ptr<RunBase> RunBasePtr; const float NEMATUS_LN_EPS = 1e-5f; - } // namespace marian diff --git a/src/graph/auto_tuner.h b/src/graph/auto_tuner.h index 868a800e..6a054332 100644 --- a/src/graph/auto_tuner.h +++ b/src/graph/auto_tuner.h @@ -20,15 +20,26 @@ class AutoTuner : public AutoTunerRecorder { private: typedef std::function<Return(Args...)> Algorithm; - const size_t max = 50; + // When the autotuner decides the fastest algorithm for a specific tensor operation (e.g. GEMM), + // the autotuner runs each algorithm at least this 'collectStatMax' number of times and + // collects the statistics. + const size_t collectStatMax = 50; UPtr<timer::CPUTimer> timer_; + // This structure holds a hash key and an algorithm function (e.g. int16, packed gemm, mkl gemm) + // for a specific operation size + // hash: a unique hash key for each operation size + // (e.g. m, n, k, transpose A, transpose B, bias size for GEMM) + // algorithm: a function that holds an algorithm struct HashedAlgorithm { size_t hash; Algorithm algorithm; }; + // This structure represents the collected statistics. 
+ // time: total accumulated time of this operator execution with the given algorithm + // runs: total number of times this algorithm was executed struct Stat { double time; size_t runs; } @@ -53,7 +64,7 @@ private: auto& stat = it->second; // collect more stats - if(stat.runs < max) + if(stat.runs < collectStatMax) return i; if(stat.time < bestTime) { @@ -93,7 +104,7 @@ public: auto it = stats_.find(hash); if(it != stats_.end()) { - if(it->second.runs < max) { + if(it->second.runs < collectStatMax) { it->second.time += seconds.count(); it->second.runs += 1; } diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 56fb654a..0f48d0b2 100755 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -413,6 +413,10 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { if(a->graph()->getBackend()->isOptimized() && device == DeviceType::cpu) { GemmType gemmType = a->graph()->getBackend()->getGemmType(); + // When gemmType is set to 'auto', an autotuner decides the best algorithm available. + // A new autotuner is created, then different kinds of algorithms are added to the autotuner. + // For each GEMM size, there is a unique hash key. + // (e.g. m, n, k, transpose A, transpose B, bias size for GEMM) if(gemmType == GemmType::Auto) { thread_local Ptr<AutoTuner<Expr>> tuner = New<AutoTuner<Expr>>(); @@ -434,76 +438,87 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { util::hash_combine(hash, transB); #if USE_FBGEMM - // Use Packed GEMM only if it's memoized + // Use Packed GEMM only if the node b in the graph is memoized. + // More specifically, packed GEMM is used only if the B matrix (weight) is constant. + // In general, 'memoized' means that the node is a constant variable or + // a combination of constant nodes which is also a constant variable + // when it's computed once. + // Those memoized nodes are cached to avoid duplicated computations. 
if(b->memoize()) { - // add third algorithm variant (Packed GEMM) - size_t hash1 = hash; - util::hash_combine(hash1, 1); - auto rec1 = [=](Expr e, bool stop = false) { - e->record(tuner, hash1, stop); + // add packed GEMM algorithm variant (Packed GEMM) to the autotuner + // Once an algorithm is added to the autotuner, + // the autotuner runs all the added algorithms a designated number of times. + // One algorithm is run per call to this operation + // and the stat for that algorithm is collected. + // When all the algorithms reach the maximum stat collection count, + // the autotuner decides the best algorithm, and keeps using it afterward. + size_t hashPack = hash; + util::hash_combine(hashPack, 1); + auto recPack = [=](Expr e, bool stop = false) { + e->record(tuner, hashPack, stop); return e; }; - auto alg1 = [=]() { + auto algPack = [=]() { auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue); - return rec1( - cpu::variant::affine( - clip(a, clipValue), packed, - b->shape(), - bias, - transA, - transB, - scale), + return recPack( + cpu::variant::affine(clip(a, clipValue), + packed, + b->shape(), + bias, + transA, + transB, + scale), true); }; - tuner->insert({hash1, alg1}); + tuner->insert({hashPack, algPack}); } #endif // USE_FBGEMM - // add first algorithm variant (Int16) - size_t hash2 = hash; - util::hash_combine(hash2, 2); - auto rec2 = [=](Expr e, bool stop = false) { - e->record(tuner, hash2, stop); + // add second algorithm variant (Int16) to the autotuner + size_t hashInt16 = hash; + util::hash_combine(hashInt16, 2); + auto recInt16 = [=](Expr e, bool stop = false) { + e->record(tuner, hashInt16, stop); return e; }; - auto alg2 = [=]() { - return rec2( - cpu::int16::affine( - rec2(cpu::int16::quantize(transA ? rec2(transpose(a)) : a, - clipValue)), - cpu::int16::quantize(transB ? 
b : transpose(b), clipValue), - bias, - scale), - true); + auto algInt16 = [=]() { + return recInt16( + cpu::int16::affine( + recInt16(cpu::int16::quantize(transA ? recInt16(transpose(a)) : a, + clipValue)), + cpu::int16::quantize(transB ? b : transpose(b), clipValue), + bias, + scale), + true); }; - tuner->insert({hash2, alg2}); + tuner->insert({hashInt16, algInt16}); - // add second algorithm variant (CBlas) - size_t hash3 = hash; - util::hash_combine(hash3, 3); - auto rec3 = [=](Expr e, bool stop = false) { - e->record(tuner, hash3, stop); + // add third algorithm variant (CBlas) to the autotuner + size_t hashCblas = hash; + util::hash_combine(hashCblas, 3); + auto recCblas = [=](Expr e, bool stop = false) { + e->record(tuner, hashCblas, stop); return e; }; - auto alg3 = [=]() { + auto algCblas = [=]() { auto ac = clip(a, clipValue); if(ac != a) - ac = rec3(ac); + ac = recCblas(ac); auto bc = clip(b, clipValue); if(bc != b) - bc = rec3(bc); + bc = recCblas(bc); int rows = ac->shape().elements() / ac->shape()[-1]; Expr ones = ac->graph()->ones({rows, 1}); std::vector<Expr> nodes = {ac, bc, bias, ones}; - return rec3(Expression<AffineNodeOp>(nodes, transA, transB, scale), + return recCblas(Expression<AffineNodeOp>(nodes, transA, transB, scale), true); }; - tuner->insert({hash3, alg3}); + tuner->insert({hashCblas, algCblas}); // execute algorithm with autotuning return tuner->run(); @@ -512,25 +527,22 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { if(gemmType == GemmType::IntrinInt16) { // cpu int16 version return cpu::int16::affine( - cpu::int16::quantize(transA ? transpose(a) : a, clipValue), - cpu::int16::quantize(transB ? b : transpose(b), clipValue), - bias, - scale); + cpu::int16::quantize(transA ? transpose(a) : a, clipValue), + cpu::int16::quantize(transB ? 
b : transpose(b), clipValue), + bias, + scale); } else if(gemmType == GemmType::FbFp16Packed) { #if USE_FBGEMM if(b->memoize()) { auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue); - // auto packed = transB ? - // cpu::variant::pack(transpose(b), cpu::pack::PackMatrix::B, false, clipValue) : - // cpu::variant::pack(b, cpu::pack::PackMatrix::B, false, clipValue); - - return cpu::variant::affine( - clip(a, clipValue), packed, - b->shape(), - bias, - transA, - transB, - scale); + + return cpu::variant::affine(clip(a, clipValue), + packed, + b->shape(), + bias, + transA, + transB, + scale); } else { int rows = a->shape().elements() / a->shape()[-1]; Expr ones = a->graph()->ones({rows, 1}); @@ -538,7 +550,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { return Expression<AffineNodeOp>(nodes, transA, transB, scale); } #else - ABORT("Packed GEMM not implemented"); + ABORT("Packed GEMM is not available in this build"); #endif // USE_FBGEMM } else if(gemmType == GemmType::MklFp32) { @@ -555,7 +567,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { = {clip(a, clipValue), clip(b, clipValue), bias, ones}; return Expression<AffineNodeOp>(nodes, transA, transB, scale); } else { - ABORT("Not implemented"); + ABORT("GemmType..{} not available by affine()", gemmType); } } } else { diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 5809c18c..d46273b0 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -72,7 +72,7 @@ public: // Use packed GEMM for the production graph_->getBackend()->setOptimized(true); - graph_->getBackend()->setGemmType("packed"); + graph_->getBackend()->setGemmType("fp16packed"); #ifdef MKL_FOUND mkl_set_num_threads(options->get<int>("mkl-threads", 1)); diff --git a/src/models/transformer_stub.cpp b/src/models/transformer_stub.cpp index 2e8d694f..871ee009 100755 --- a/src/models/transformer_stub.cpp +++ 
b/src/models/transformer_stub.cpp @@ -1,4 +1,4 @@ -#include "models/transformer.h"
+#include "models/transformer.h" namespace marian { // factory functions diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 094e9331..8d4adcfc 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -27,9 +27,11 @@ public: if (gemmType == "auto") gemmType_ = GemmType::Auto; else if (gemmType == "mklfp32") gemmType_ = GemmType::MklFp32; else if (gemmType == "intrinint16") gemmType_ = GemmType::IntrinInt16; +#if USE_FBGEMM else if (gemmType == "fp16packed") gemmType_ = GemmType::FbFp16Packed; else if (gemmType == "int8packed") gemmType_ = GemmType::FbInt8Packed; - else ABORT("Unknown GEMM type"); +#endif // USE_FBGEMM + else ABORT("Unknown GEMM type - '{}'", gemmType); } GemmType getGemmType() override { return gemmType_; } }; diff --git a/src/tensors/cpu/expanded_gemm.h b/src/tensors/cpu/expanded_gemm.h index 03e58ae4..8ccdb415 100644 --- a/src/tensors/cpu/expanded_gemm.h +++ b/src/tensors/cpu/expanded_gemm.h @@ -157,7 +157,7 @@ public: return {NodeOp(0)}; } - const std::string type() override { return "affinePacked"; } + const std::string type() override { return "fp16packed"; } }; static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) { |