github.com/marian-nmt/marian.git
 cmake/FindMKL.cmake                |  10 +-
 src/common/definitions.h           |   1 -
 src/graph/auto_tuner.h             |  17 +-
 src/graph/expression_operators.cpp | 128 ++++++------
 src/microsoft/quicksand.cpp        |   2 +-
 src/models/transformer_stub.cpp    |   2 +-
 src/tensors/cpu/backend.h          |   4 +-
 src/tensors/cpu/expanded_gemm.h    |   2 +-
 8 files changed, 95 insertions(+), 71 deletions(-)
diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
index 028161e3..a484e9ab 100644
--- a/cmake/FindMKL.cmake
+++ b/cmake/FindMKL.cmake
@@ -53,11 +53,11 @@ else()
set(COR_LIB "mkl_core")
endif()
-if(MSVC)
- set(ProgramFilesx86 "ProgramFiles(x86)")
- set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows)
-else()
- set(INTEL_ROOT_DEFAULT "/opt/intel")
+if(MSVC)
+ set(ProgramFilesx86 "ProgramFiles(x86)")
+ set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows)
+else()
+ set(INTEL_ROOT_DEFAULT "/opt/intel")
endif()
set(INTEL_ROOT ${INTEL_ROOT_DEFAULT} CACHE PATH "Folder contains intel libs")
find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKLROOT} ${INTEL_ROOT}/mkl
diff --git a/src/common/definitions.h b/src/common/definitions.h
index ea14a8e5..f46da234 100755
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -116,5 +116,4 @@ typedef Ptr<RunBase> RunBasePtr;
const float NEMATUS_LN_EPS = 1e-5f;
-
} // namespace marian
diff --git a/src/graph/auto_tuner.h b/src/graph/auto_tuner.h
index 868a800e..6a054332 100644
--- a/src/graph/auto_tuner.h
+++ b/src/graph/auto_tuner.h
@@ -20,15 +20,26 @@ class AutoTuner : public AutoTunerRecorder {
private:
typedef std::function<Return(Args...)> Algorithm;
- const size_t max = 50;
+ // To decide the fastest algorithm for a specific tensor operation (e.g. GEMM),
+ // the autotuner runs each candidate algorithm at least 'collectStatMax' times
+ // and collects timing statistics.
+ const size_t collectStatMax = 50;
UPtr<timer::CPUTimer> timer_;
+ // This structure holds a hash key and an algorithm function (e.g. int16,
+ // packed GEMM, MKL GEMM) for a specific operation size.
+ // hash: a unique hash key for each operation size
+ // (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
+ // algorithm: the function implementing the algorithm
struct HashedAlgorithm {
size_t hash;
Algorithm algorithm;
};
+ // This structure represents the statistics collected for one algorithm.
+ // time: total accumulated execution time of this operator with the given algorithm
+ // runs: total number of times this algorithm was executed
struct Stat {
double time;
size_t runs;
@@ -53,7 +64,7 @@ private:
auto& stat = it->second;
// collect more stats
- if(stat.runs < max)
+ if(stat.runs < collectStatMax)
return i;
if(stat.time < bestTime) {
@@ -93,7 +104,7 @@ public:
auto it = stats_.find(hash);
if(it != stats_.end()) {
- if(it->second.runs < max) {
+ if(it->second.runs < collectStatMax) {
it->second.time += seconds.count();
it->second.runs += 1;
}
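
The new comments in auto_tuner.h describe a collect-then-decide policy: every registered algorithm is timed until it has run collectStatMax times, and after that the one with the lowest accumulated time is always used. Below is a minimal self-contained sketch of that policy. It is an assumed simplification, not the real marian::AutoTuner: the real tuner accumulates timings through record() callbacks on graph nodes, while this sketch times the call directly, and TinyTuner is a hypothetical name.

// Minimal sketch of the collect-then-decide autotuning policy (assumption:
// simplified from the comments above, not marian's actual implementation).
#include <chrono>
#include <cstddef>
#include <functional>
#include <limits>
#include <unordered_map>
#include <utility>
#include <vector>

template <typename Return>
class TinyTuner {
  struct Stat {
    double time = 0;  // total accumulated execution time (seconds)
    size_t runs = 0;  // total number of times this algorithm was executed
  };
  const size_t collectStatMax = 50;
  std::vector<std::pair<size_t, std::function<Return()>>> algorithms_;
  std::unordered_map<size_t, Stat> stats_;

public:
  void insert(size_t hash, std::function<Return()> alg) {
    algorithms_.emplace_back(hash, std::move(alg));
  }

  Return run() {
    // Collection phase: run and time any algorithm that has not yet
    // reached collectStatMax measurements.
    for(auto& [hash, alg] : algorithms_) {
      auto& stat = stats_[hash];
      if(stat.runs < collectStatMax) {
        auto start = std::chrono::steady_clock::now();
        Return r = alg();
        std::chrono::duration<double> sec
            = std::chrono::steady_clock::now() - start;
        stat.time += sec.count();
        stat.runs += 1;
        return r;
      }
    }
    // Decision phase: all statistics collected; keep the fastest algorithm.
    const std::function<Return()>* best = nullptr;
    double bestTime = std::numeric_limits<double>::max();
    for(auto& [hash, alg] : algorithms_) {
      double t = stats_[hash].time;
      if(t < bestTime) {
        bestTime = t;
        best = &alg;
      }
    }
    return (*best)();
  }
};
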
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 56fb654a..0f48d0b2 100755
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -413,6 +413,10 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
if(a->graph()->getBackend()->isOptimized() && device == DeviceType::cpu) {
GemmType gemmType = a->graph()->getBackend()->getGemmType();
+ // When gemmType is set to 'auto', the autotuner decides the best available algorithm.
+ // A new autotuner is created, then the different algorithm variants are added to it.
+ // Each GEMM size has a unique hash key
+ // (e.g. m, n, k, transpose A, transpose B, bias size for GEMM).
if(gemmType == GemmType::Auto) {
thread_local Ptr<AutoTuner<Expr>> tuner = New<AutoTuner<Expr>>();
@@ -434,76 +438,87 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
util::hash_combine(hash, transB);
#if USE_FBGEMM
- // Use Packed GEMM only if it's memoized
+ // Use Packed GEMM only if the node b in the graph is memoized.
+ // More specifically, packed GEMM is used only if the B matrix (weight) is constant.
+ // In general, 'memoized' means that the node is a constant variable or
+ // a combination of constant nodes, which is itself constant once computed.
+ // Memoized nodes are cached to avoid duplicated computation.
if(b->memoize()) {
- // add third algorithm variant (Packed GEMM)
- size_t hash1 = hash;
- util::hash_combine(hash1, 1);
- auto rec1 = [=](Expr e, bool stop = false) {
- e->record(tuner, hash1, stop);
+ // Add the packed GEMM algorithm variant to the autotuner.
+ // Once algorithms are added to the autotuner,
+ // it runs each of them a designated number of times.
+ // One algorithm is run per call to this operation,
+ // and the statistics for that algorithm are collected.
+ // When all algorithms reach the maximum stat-collection count,
+ // the autotuner decides the best algorithm and keeps using it afterwards.
+ size_t hashPack = hash;
+ util::hash_combine(hashPack, 1);
+ auto recPack = [=](Expr e, bool stop = false) {
+ e->record(tuner, hashPack, stop);
return e;
};
- auto alg1 = [=]() {
+ auto algPack = [=]() {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
- return rec1(
- cpu::variant::affine(
- clip(a, clipValue), packed,
- b->shape(),
- bias,
- transA,
- transB,
- scale),
+ return recPack(
+ cpu::variant::affine(clip(a, clipValue),
+ packed,
+ b->shape(),
+ bias,
+ transA,
+ transB,
+ scale),
true);
};
- tuner->insert({hash1, alg1});
+ tuner->insert({hashPack, algPack});
}
#endif // USE_FBGEMM
- // add first algorithm variant (Int16)
- size_t hash2 = hash;
- util::hash_combine(hash2, 2);
- auto rec2 = [=](Expr e, bool stop = false) {
- e->record(tuner, hash2, stop);
+ // add second algorithm variant (Int16) to the autotuner
+ size_t hashInt16 = hash;
+ util::hash_combine(hashInt16, 2);
+ auto recInt16 = [=](Expr e, bool stop = false) {
+ e->record(tuner, hashInt16, stop);
return e;
};
- auto alg2 = [=]() {
- return rec2(
- cpu::int16::affine(
- rec2(cpu::int16::quantize(transA ? rec2(transpose(a)) : a,
- clipValue)),
- cpu::int16::quantize(transB ? b : transpose(b), clipValue),
- bias,
- scale),
- true);
+ auto algInt16 = [=]() {
+ return recInt16(
+ cpu::int16::affine(
+ recInt16(cpu::int16::quantize(transA ? recInt16(transpose(a)) : a,
+ clipValue)),
+ cpu::int16::quantize(transB ? b : transpose(b), clipValue),
+ bias,
+ scale),
+ true);
};
- tuner->insert({hash2, alg2});
+ tuner->insert({hashInt16, algInt16});
- // add second algorithm variant (CBlas)
- size_t hash3 = hash;
- util::hash_combine(hash3, 3);
- auto rec3 = [=](Expr e, bool stop = false) {
- e->record(tuner, hash3, stop);
+ // add third algorithm variant (CBlas) to the autotuner
+ size_t hashCblas = hash;
+ util::hash_combine(hashCblas, 3);
+ auto recCblas = [=](Expr e, bool stop = false) {
+ e->record(tuner, hashCblas, stop);
return e;
};
- auto alg3 = [=]() {
+ auto algCblas = [=]() {
auto ac = clip(a, clipValue);
if(ac != a)
- ac = rec3(ac);
+ ac = recCblas(ac);
auto bc = clip(b, clipValue);
if(bc != b)
- bc = rec3(bc);
+ bc = recCblas(bc);
int rows = ac->shape().elements() / ac->shape()[-1];
Expr ones = ac->graph()->ones({rows, 1});
std::vector<Expr> nodes = {ac, bc, bias, ones};
- return rec3(Expression<AffineNodeOp>(nodes, transA, transB, scale),
+ return recCblas(Expression<AffineNodeOp>(nodes, transA, transB, scale),
true);
};
- tuner->insert({hash3, alg3});
+ tuner->insert({hashCblas, algCblas});
// execute algorithm with autotuning
return tuner->run();
@@ -512,25 +527,22 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
if(gemmType == GemmType::IntrinInt16) {
// cpu int16 version
return cpu::int16::affine(
- cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
- cpu::int16::quantize(transB ? b : transpose(b), clipValue),
- bias,
- scale);
+ cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
+ cpu::int16::quantize(transB ? b : transpose(b), clipValue),
+ bias,
+ scale);
} else if(gemmType == GemmType::FbFp16Packed) {
#if USE_FBGEMM
if(b->memoize()) {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
- // auto packed = transB ?
- // cpu::variant::pack(transpose(b), cpu::pack::PackMatrix::B, false, clipValue) :
- // cpu::variant::pack(b, cpu::pack::PackMatrix::B, false, clipValue);
-
- return cpu::variant::affine(
- clip(a, clipValue), packed,
- b->shape(),
- bias,
- transA,
- transB,
- scale);
+
+ return cpu::variant::affine(clip(a, clipValue),
+ packed,
+ b->shape(),
+ bias,
+ transA,
+ transB,
+ scale);
} else {
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1});
@@ -538,7 +550,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
#else
- ABORT("Packed GEMM not implemented");
+ ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else if(gemmType == GemmType::MklFp32) {
@@ -555,7 +567,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
= {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
} else {
- ABORT("Not implemented");
+ ABORT("GemmType..{} not available by affine()", gemmType);
}
}
} else {
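
Each variant above is registered under a key that extends the per-size hash with a variant id (1 = packed, 2 = int16, 3 = CBlas), so the tuner keeps separate statistics per GEMM shape and per algorithm. A small sketch of how such a key can be formed; the boost-style combiner below is a stand-in for marian's util::hash_combine, and gemmKey is a hypothetical helper, not a function in the diff.

// Sketch of a per-size, per-variant tuner key (assumption: mirrors the
// util::hash_combine usage shown in the hunks above).
#include <cstddef>
#include <functional>

template <typename T>
void hash_combine(std::size_t& seed, const T& v) {
  // boost-style combiner; stand-in for marian's util::hash_combine
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

std::size_t gemmKey(int m, int n, int k, bool transA, bool transB,
                    std::size_t biasSize, std::size_t variantId) {
  std::size_t h = 0;
  hash_combine(h, m);
  hash_combine(h, n);
  hash_combine(h, k);
  hash_combine(h, transA);
  hash_combine(h, transB);
  hash_combine(h, biasSize);
  hash_combine(h, variantId);  // 1 = packed, 2 = int16, 3 = cblas
  return h;
}
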
diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp
index 5809c18c..d46273b0 100644
--- a/src/microsoft/quicksand.cpp
+++ b/src/microsoft/quicksand.cpp
@@ -72,7 +72,7 @@ public:
// Use packed GEMM for the production
graph_->getBackend()->setOptimized(true);
- graph_->getBackend()->setGemmType("packed");
+ graph_->getBackend()->setGemmType("fp16packed");
#ifdef MKL_FOUND
mkl_set_num_threads(options->get<int>("mkl-threads", 1));
diff --git a/src/models/transformer_stub.cpp b/src/models/transformer_stub.cpp
index 2e8d694f..871ee009 100755
--- a/src/models/transformer_stub.cpp
+++ b/src/models/transformer_stub.cpp
@@ -1,4 +1,4 @@
-#include "models/transformer.h"
+#include "models/transformer.h"
namespace marian {
// factory functions
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
index 094e9331..8d4adcfc 100644
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@@ -27,9 +27,11 @@ public:
if (gemmType == "auto") gemmType_ = GemmType::Auto;
else if (gemmType == "mklfp32") gemmType_ = GemmType::MklFp32;
else if (gemmType == "intrinint16") gemmType_ = GemmType::IntrinInt16;
+#if USE_FBGEMM
else if (gemmType == "fp16packed") gemmType_ = GemmType::FbFp16Packed;
else if (gemmType == "int8packed") gemmType_ = GemmType::FbInt8Packed;
- else ABORT("Unknown GEMM type");
+#endif // USE_FBGEMM
+ else ABORT("Unknown GEMM type - '{}'", gemmType);
}
GemmType getGemmType() override { return gemmType_; }
};
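
On the caller side this pairs with the quicksand.cpp change above: the backend is switched to the optimized path and then given one of the recognized type strings. A hypothetical usage snippet, where 'backend' stands for any cpu::Backend instance:

// Hypothetical usage of the CPU backend's GEMM selection ('backend' is an
// assumed cpu::Backend instance). In a build without USE_FBGEMM, the two
// packed names are not parsed and fall through to ABORT("Unknown GEMM type...").
backend->setOptimized(true);         // enable the optimized CPU kernels
backend->setGemmType("fp16packed");  // or "auto", "mklfp32", "intrinint16"
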
diff --git a/src/tensors/cpu/expanded_gemm.h b/src/tensors/cpu/expanded_gemm.h
index 03e58ae4..8ccdb415 100644
--- a/src/tensors/cpu/expanded_gemm.h
+++ b/src/tensors/cpu/expanded_gemm.h
@@ -157,7 +157,7 @@ public:
return {NodeOp(0)};
}
- const std::string type() override { return "affinePacked"; }
+ const std::string type() override { return "fp16packed"; }
};
static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
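
For reference, the packed path that ends in this node follows a pack-once, reuse-many pattern: because b is memoized (constant), the packed representation is computed a single time and every affine() of the same shape reuses it. The calls below are copied from the expression_operators.cpp hunk above and shown here only to illustrate the sequence:

// Pack the constant weight matrix B once; the result is memoized/cached.
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
// Every subsequent affine() with this shape reuses the packed weights.
auto out = cpu::variant::affine(clip(a, clipValue),
                                packed,
                                b->shape(),
                                bias,
                                transA,
                                transB,
                                scale);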